1 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
2 index 3a3b30ac2a75..9e0745cafbd8 100644
3 --- a/Documentation/sysrq.txt
4 +++ b/Documentation/sysrq.txt
5 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
6  On other - If you know of the key combos for other architectures, please
7             let me know so I can add them to this section.
8  
9 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
10 -
11 +On all -  write a character to /proc/sysrq-trigger, e.g.:
12                 echo t > /proc/sysrq-trigger
13  
14 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
15 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
16 +        Send an ICMP echo request with this pattern plus the particular
17 +        SysRq command key. Example:
18 +               # ping -c1 -s57 -p0102030468
19 +        will trigger the SysRq-H (help) command.
20 +
21 +
22  *  What are the 'command' keys?
23  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24  'b'     - Will immediately reboot the system without syncing or unmounting
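
For reference, a minimal shell sketch of how the network SysRq described above can be
driven, assuming the 0x01020304 cookie from the example; the command key and the target
address are placeholders:

        key=t                                     # SysRq command key, 't' = show tasks
        hex=$(printf '%02x' "'$key")              # ASCII code of the key in hex, here 74
        echo 0x01020304 > /proc/sys/net/ipv4/icmp_echo_sysrq    # run on the target machine
        ping -c1 -s57 -p"01020304$hex" 192.0.2.10                # cookie bytes followed by the key byte
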
25 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
26 new file mode 100644
27 index 000000000000..6f2aeabf7faa
28 --- /dev/null
29 +++ b/Documentation/trace/histograms.txt
30 @@ -0,0 +1,186 @@
31 +               Using the Linux Kernel Latency Histograms
32 +
33 +
34 +This document gives a short explanation of how to enable, configure and use
35 +latency histograms. Latency histograms are primarily relevant in the
36 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
37 +and are used in the quality management of the Linux real-time
38 +capabilities.
39 +
40 +
41 +* Purpose of latency histograms
42 +
43 +A latency histogram continuously accumulates the frequencies of latency
44 +data. There are two types of histograms:
45 +- potential sources of latencies
46 +- effective latencies
47 +
48 +
49 +* Potential sources of latencies
50 +
51 +Potential sources of latencies are code segments where interrupts,
52 +preemption or both are disabled (aka critical sections). To create
53 +histograms of potential sources of latency, the kernel stores the time
54 +stamp at the start of a critical section, determines the time elapsed
55 +when the end of the section is reached, and increments the frequency
56 +counter of that latency value - irrespective of whether any concurrently
57 +running process is affected by latency or not.
58 +- Configuration items (in the Kernel hacking/Tracers submenu)
59 +  CONFIG_INTERRUPT_OFF_LATENCY
60 +  CONFIG_PREEMPT_OFF_LATENCY
61 +
62 +
63 +* Effective latencies
64 +
65 +Effective latencies are those that actually occur during wakeup of a process. To
66 +determine effective latencies, the kernel stores the time stamp when a
67 +process is scheduled to be woken up, and determines the duration of the
68 +wakeup time shortly before control is passed over to this process. Note
69 +that the apparent latency in user space may be somewhat longer, since the
70 +process may be interrupted after control is passed over to it but before
71 +the execution in user space takes place. Simply measuring the interval
72 +between enqueuing and wakeup may also not be appropriate in cases when a
73 +process is scheduled as a result of a timer expiration. The timer may have
74 +missed its deadline, e.g. due to disabled interrupts, but this latency
75 +would not be registered. Therefore, the offsets of missed timers are
76 +recorded in a separate histogram. If both wakeup latency and missed timer
77 +offsets are configured and enabled, a third histogram may be enabled that
78 +records the overall latency as a sum of the timer latency, if any, and the
79 +wakeup latency. This histogram is called "timerandwakeup".
80 +- Configuration items (in the Kernel hacking/Tracers submenu)
81 +  CONFIG_WAKEUP_LATENCY
82 +  CONFIG_MISSED_TIMER_OFFSETS
83 +
84 +
85 +* Usage
86 +
87 +The interface to the administration of the latency histograms is located
88 +in the debugfs file system. To mount it, either enter
89 +
90 +mount -t sysfs nodev /sys
91 +mount -t debugfs nodev /sys/kernel/debug
92 +
93 +from shell command line level, or add
94 +
95 +nodev  /sys                    sysfs   defaults        0 0
96 +nodev  /sys/kernel/debug       debugfs defaults        0 0
97 +
98 +to the file /etc/fstab. All latency histogram related files are then
99 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
100 +particular histogram type is enabled by writing non-zero to the related
101 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
102 +Select "preemptirqsoff" for the histograms of potential sources of
103 +latencies and "wakeup" for histograms of effective latencies etc. The
104 +histogram data - one per CPU - are available in the files
105 +
106 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
110 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
111 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
112 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
113 +
114 +The histograms are reset by writing non-zero to the file "reset" in a
115 +particular latency directory. To reset all latency data, use
116 +
117 +#!/bin/sh
118 +
119 +TRACINGDIR=/sys/kernel/debug/tracing
120 +HISTDIR=$TRACINGDIR/latency_hist
121 +
122 +if test -d $HISTDIR
123 +then
124 +  cd $HISTDIR
125 +  for i in `find . | grep /reset$`
126 +  do
127 +    echo 1 >$i
128 +  done
129 +fi
130 +
131 +
132 +* Data format
133 +
134 +Latency data are stored with a resolution of one microsecond. The
135 +maximum latency is 10,240 microseconds. The data are only valid if the
136 +overflow register is empty. Every output line contains the latency in
137 +microseconds in the first column and the number of samples in the second
138 +column. To display only lines with a positive sample count, use, for
139 +example,
140 +
141 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
142 +
143 +#Minimum latency: 0 microseconds.
144 +#Average latency: 0 microseconds.
145 +#Maximum latency: 25 microseconds.
146 +#Total samples: 3104770694
147 +#There are 0 samples greater or equal than 10240 microseconds
148 +#usecs          samples
149 +    0        2984486876
150 +    1          49843506
151 +    2          58219047
152 +    3           5348126
153 +    4           2187960
154 +    5           3388262
155 +    6            959289
156 +    7            208294
157 +    8             40420
158 +    9              4485
159 +   10             14918
160 +   11             18340
161 +   12             25052
162 +   13             19455
163 +   14              5602
164 +   15               969
165 +   16                47
166 +   17                18
167 +   18                14
168 +   19                 1
169 +   20                 3
170 +   21                 2
171 +   22                 5
172 +   23                 2
173 +   25                 1
174 +
175 +
176 +* Wakeup latency of a selected process
177 +
178 +To only collect wakeup latency data of a particular process, write the
179 +PID of the requested process to
180 +
181 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
182 +
183 +PIDs are not considered if this variable is set to 0.
184 +
185 +
186 +* Details of the process with the highest wakeup latency so far
187 +
188 +Selected data of the process that suffered from the highest wakeup
189 +latency that occurred in a particular CPU are available in the file
190 +
191 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
192 +
193 +In addition, other relevant system data at the time when the
194 +latency occurred are given.
195 +
196 +The format of the data is (all in one line):
197 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
198 +<- <PID> <Priority> <Command> <Timestamp>
199 +
200 +The value of <Timeroffset> is only relevant in the combined timer
201 +and wakeup latency recording. In the wakeup recording, it is
202 +always 0; in the missed_timer_offsets recording, it is the same
203 +as <Latency>.
204 +
205 +When retrospectively searching for the origin of a latency and
206 +tracing was not enabled, it may be helpful to know the name and
207 +some basic data of the task that (finally) switched to the late
208 +real-time task. In addition to the victim's data, the data of
209 +the possible culprit are therefore displayed after the
210 +"<-" symbol.
211 +
212 +Finally, the timestamp of when the latency occurred, given in
213 +<seconds>.<microseconds> since the most recent system boot,
214 +is provided.
215 +
216 +These data are also reset when the wakeup histogram is reset.
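
For reference, a shell sketch tying together the histogram interface described above:
it enables wakeup-latency histograms, restricts them to a single process and reads the
results back. The cyclictest PID is only a placeholder for whichever task is of interest:

        TRACINGDIR=/sys/kernel/debug/tracing
        HISTDIR=$TRACINGDIR/latency_hist

        mount -t debugfs nodev /sys/kernel/debug 2>/dev/null          # no-op if already mounted
        echo 1 > $HISTDIR/enable/wakeup                               # enable wakeup histograms
        pidof cyclictest | awk '{ print $1 }' > $HISTDIR/wakeup/pid   # watch one PID (placeholder)
        sleep 60                                                      # let samples accumulate
        grep -v " 0$" $HISTDIR/wakeup/CPU0                            # buckets with non-zero sample counts
        awk '$1 >= 100 { n += $2 } END { print n+0 }' $HISTDIR/wakeup/CPU0   # samples of 100 us or more
        cat $HISTDIR/wakeup/max_latency-CPU0                          # worst-case record for CPU0
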
217 diff --git a/MAINTAINERS b/MAINTAINERS
218 index 63cefa62324c..be0ea1e5c4cc 100644
219 --- a/MAINTAINERS
220 +++ b/MAINTAINERS
221 @@ -5196,6 +5196,23 @@ F:       fs/fuse/
222  F:     include/uapi/linux/fuse.h
223  F:     Documentation/filesystems/fuse.txt
224  
225 +FUTEX SUBSYSTEM
226 +M:     Thomas Gleixner <tglx@linutronix.de>
227 +M:     Ingo Molnar <mingo@redhat.com>
228 +R:     Peter Zijlstra <peterz@infradead.org>
229 +R:     Darren Hart <dvhart@infradead.org>
230 +L:     linux-kernel@vger.kernel.org
231 +T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
232 +S:     Maintained
233 +F:     kernel/futex.c
234 +F:     kernel/futex_compat.c
235 +F:     include/asm-generic/futex.h
236 +F:     include/linux/futex.h
237 +F:     include/uapi/linux/futex.h
238 +F:     tools/testing/selftests/futex/
239 +F:     tools/perf/bench/futex*
240 +F:     Documentation/*futex*
241 +
242  FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
243  M:     Rik Faith <faith@cs.unc.edu>
244  L:     linux-scsi@vger.kernel.org
245 diff --git a/arch/Kconfig b/arch/Kconfig
246 index 659bdd079277..099fc0f5155e 100644
247 --- a/arch/Kconfig
248 +++ b/arch/Kconfig
249 @@ -9,6 +9,7 @@ config OPROFILE
250         tristate "OProfile system profiling"
251         depends on PROFILING
252         depends on HAVE_OPROFILE
253 +       depends on !PREEMPT_RT_FULL
254         select RING_BUFFER
255         select RING_BUFFER_ALLOW_SWAP
256         help
257 @@ -52,6 +53,7 @@ config KPROBES
258  config JUMP_LABEL
259         bool "Optimize very unlikely/likely branches"
260         depends on HAVE_ARCH_JUMP_LABEL
261 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
262         help
263           This option enables a transparent branch optimization that
264          makes certain almost-always-true or almost-always-false branch
265 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
266 index b5d529fdffab..5715844e83e3 100644
267 --- a/arch/arm/Kconfig
268 +++ b/arch/arm/Kconfig
269 @@ -36,7 +36,7 @@ config ARM
270         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
271         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
272         select HAVE_ARCH_HARDENED_USERCOPY
273 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
274 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
275         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
276         select HAVE_ARCH_MMAP_RND_BITS if MMU
277         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
278 @@ -75,6 +75,7 @@ config ARM
279         select HAVE_PERF_EVENTS
280         select HAVE_PERF_REGS
281         select HAVE_PERF_USER_STACK_DUMP
282 +       select HAVE_PREEMPT_LAZY
283         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
284         select HAVE_REGS_AND_STACK_ACCESS_API
285         select HAVE_SYSCALL_TRACEPOINTS
286 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
287 index e53638c8ed8a..6095a1649865 100644
288 --- a/arch/arm/include/asm/irq.h
289 +++ b/arch/arm/include/asm/irq.h
290 @@ -22,6 +22,8 @@
291  #endif
292  
293  #ifndef __ASSEMBLY__
294 +#include <linux/cpumask.h>
295 +
296  struct irqaction;
297  struct pt_regs;
298  extern void migrate_irqs(void);
299 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
300 index 12ebfcc1d539..c962084605bc 100644
301 --- a/arch/arm/include/asm/switch_to.h
302 +++ b/arch/arm/include/asm/switch_to.h
303 @@ -3,6 +3,13 @@
304  
305  #include <linux/thread_info.h>
306  
307 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
308 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
309 +#else
310 +static inline void
311 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
312 +#endif
313 +
314  /*
315   * For v7 SMP cores running a preemptible kernel we may be pre-empted
316   * during a TLB maintenance operation, so execute an inner-shareable dsb
317 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
318  #define switch_to(prev,next,last)                                      \
319  do {                                                                   \
320         __complete_pending_tlbi();                                      \
321 +       switch_kmaps(prev, next);                                       \
322         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
323  } while (0)
324  
325 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
326 index 776757d1604a..1f36a4eccc72 100644
327 --- a/arch/arm/include/asm/thread_info.h
328 +++ b/arch/arm/include/asm/thread_info.h
329 @@ -49,6 +49,7 @@ struct cpu_context_save {
330  struct thread_info {
331         unsigned long           flags;          /* low level flags */
332         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
333 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
334         mm_segment_t            addr_limit;     /* address limit */
335         struct task_struct      *task;          /* main task structure */
336         __u32                   cpu;            /* cpu */
337 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
338  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
339  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
340  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
341 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
342 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
343 +#define TIF_NEED_RESCHED_LAZY  7
344  
345  #define TIF_NOHZ               12      /* in adaptive nohz mode */
346  #define TIF_USING_IWMMXT       17
347 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
348  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
349  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
350  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
351 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
352  #define _TIF_UPROBE            (1 << TIF_UPROBE)
353  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
354  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
355 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
356   * Change these and you break ASM code in entry-common.S
357   */
358  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
359 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
360 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
361 +                                _TIF_NEED_RESCHED_LAZY)
362  
363  #endif /* __KERNEL__ */
364  #endif /* __ASM_ARM_THREAD_INFO_H */
365 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
366 index 608008229c7d..3866da3f7bb7 100644
367 --- a/arch/arm/kernel/asm-offsets.c
368 +++ b/arch/arm/kernel/asm-offsets.c
369 @@ -65,6 +65,7 @@ int main(void)
370    BLANK();
371    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
372    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
373 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
374    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
375    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
376    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
377 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
378 index 9f157e7c51e7..468e224d76aa 100644
379 --- a/arch/arm/kernel/entry-armv.S
380 +++ b/arch/arm/kernel/entry-armv.S
381 @@ -220,11 +220,18 @@ __irq_svc:
382  
383  #ifdef CONFIG_PREEMPT
384         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
385 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
386         teq     r8, #0                          @ if preempt count != 0
387 +       bne     1f                              @ return from exception
388 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
389 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
390 +       blne    svc_preempt                     @ preempt!
391 +
392 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
393 +       teq     r8, #0                          @ if preempt lazy count != 0
394         movne   r0, #0                          @ force flags to 0
395 -       tst     r0, #_TIF_NEED_RESCHED
396 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
397         blne    svc_preempt
398 +1:
399  #endif
400  
401         svc_exit r5, irq = 1                    @ return from exception
402 @@ -239,8 +246,14 @@ svc_preempt:
403  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
404         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
405         tst     r0, #_TIF_NEED_RESCHED
406 +       bne     1b
407 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
408         reteq   r8                              @ go again
409 -       b       1b
410 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
411 +       teq     r0, #0                          @ if preempt lazy count != 0
412 +       beq     1b
413 +       ret     r8                              @ go again
414 +
415  #endif
416  
417  __und_fault:
418 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
419 index 10c3283d6c19..8872937862cc 100644
420 --- a/arch/arm/kernel/entry-common.S
421 +++ b/arch/arm/kernel/entry-common.S
422 @@ -36,7 +36,9 @@ ret_fast_syscall:
423   UNWIND(.cantunwind    )
424         disable_irq_notrace                     @ disable interrupts
425         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
426 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
427 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
428 +       bne     fast_work_pending
429 +       tst     r1, #_TIF_SECCOMP
430         bne     fast_work_pending
431  
432         /* perform architecture specific actions before user return */
433 @@ -62,8 +64,11 @@ ret_fast_syscall:
434         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
435         disable_irq_notrace                     @ disable interrupts
436         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
437 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
438 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
439 +       bne     do_slower_path
440 +       tst     r1, #_TIF_SECCOMP
441         beq     no_work_pending
442 +do_slower_path:
443   UNWIND(.fnend         )
444  ENDPROC(ret_fast_syscall)
445  
446 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
447 index 69bda1a5707e..1f665acaa6a9 100644
448 --- a/arch/arm/kernel/patch.c
449 +++ b/arch/arm/kernel/patch.c
450 @@ -15,7 +15,7 @@ struct patch {
451         unsigned int insn;
452  };
453  
454 -static DEFINE_SPINLOCK(patch_lock);
455 +static DEFINE_RAW_SPINLOCK(patch_lock);
456  
457  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
458         __acquires(&patch_lock)
459 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
460                 return addr;
461  
462         if (flags)
463 -               spin_lock_irqsave(&patch_lock, *flags);
464 +               raw_spin_lock_irqsave(&patch_lock, *flags);
465         else
466                 __acquire(&patch_lock);
467  
468 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
469         clear_fixmap(fixmap);
470  
471         if (flags)
472 -               spin_unlock_irqrestore(&patch_lock, *flags);
473 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
474         else
475                 __release(&patch_lock);
476  }
477 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
478 index 91d2d5b01414..750550098b59 100644
479 --- a/arch/arm/kernel/process.c
480 +++ b/arch/arm/kernel/process.c
481 @@ -322,6 +322,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
482  }
483  
484  #ifdef CONFIG_MMU
485 +/*
486 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
487 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
488 + * fail.
489 + */
490 +static int __init vectors_user_mapping_init_page(void)
491 +{
492 +       struct page *page;
493 +       unsigned long addr = 0xffff0000;
494 +       pgd_t *pgd;
495 +       pud_t *pud;
496 +       pmd_t *pmd;
497 +
498 +       pgd = pgd_offset_k(addr);
499 +       pud = pud_offset(pgd, addr);
500 +       pmd = pmd_offset(pud, addr);
501 +       page = pmd_page(*(pmd));
502 +
503 +       pgtable_page_ctor(page);
504 +
505 +       return 0;
506 +}
507 +late_initcall(vectors_user_mapping_init_page);
508 +
509  #ifdef CONFIG_KUSER_HELPERS
510  /*
511   * The vectors page is always readable from user space for the
512 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
513 index 7b8f2141427b..96541e00b74a 100644
514 --- a/arch/arm/kernel/signal.c
515 +++ b/arch/arm/kernel/signal.c
516 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
517          */
518         trace_hardirqs_off();
519         do {
520 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
521 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
522 +                                          _TIF_NEED_RESCHED_LAZY))) {
523                         schedule();
524                 } else {
525                         if (unlikely(!user_mode(regs)))
526 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
527 index 7dd14e8395e6..4cd7e3d98035 100644
528 --- a/arch/arm/kernel/smp.c
529 +++ b/arch/arm/kernel/smp.c
530 @@ -234,8 +234,6 @@ int __cpu_disable(void)
531         flush_cache_louis();
532         local_flush_tlb_all();
533  
534 -       clear_tasks_mm_cpumask(cpu);
535 -
536         return 0;
537  }
538  
539 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
540                 pr_err("CPU%u: cpu didn't die\n", cpu);
541                 return;
542         }
543 +
544 +       clear_tasks_mm_cpumask(cpu);
545 +
546         pr_notice("CPU%u: shutdown\n", cpu);
547  
548         /*
549 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
550 index 0bee233fef9a..314cfb232a63 100644
551 --- a/arch/arm/kernel/unwind.c
552 +++ b/arch/arm/kernel/unwind.c
553 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
554  static const struct unwind_idx *__origin_unwind_idx;
555  extern const struct unwind_idx __stop_unwind_idx[];
556  
557 -static DEFINE_SPINLOCK(unwind_lock);
558 +static DEFINE_RAW_SPINLOCK(unwind_lock);
559  static LIST_HEAD(unwind_tables);
560  
561  /* Convert a prel31 symbol to an absolute address */
562 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
563                 /* module unwind tables */
564                 struct unwind_table *table;
565  
566 -               spin_lock_irqsave(&unwind_lock, flags);
567 +               raw_spin_lock_irqsave(&unwind_lock, flags);
568                 list_for_each_entry(table, &unwind_tables, list) {
569                         if (addr >= table->begin_addr &&
570                             addr < table->end_addr) {
571 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
572                                 break;
573                         }
574                 }
575 -               spin_unlock_irqrestore(&unwind_lock, flags);
576 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
577         }
578  
579         pr_debug("%s: idx = %p\n", __func__, idx);
580 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
581         tab->begin_addr = text_addr;
582         tab->end_addr = text_addr + text_size;
583  
584 -       spin_lock_irqsave(&unwind_lock, flags);
585 +       raw_spin_lock_irqsave(&unwind_lock, flags);
586         list_add_tail(&tab->list, &unwind_tables);
587 -       spin_unlock_irqrestore(&unwind_lock, flags);
588 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
589  
590         return tab;
591  }
592 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
593         if (!tab)
594                 return;
595  
596 -       spin_lock_irqsave(&unwind_lock, flags);
597 +       raw_spin_lock_irqsave(&unwind_lock, flags);
598         list_del(&tab->list);
599 -       spin_unlock_irqrestore(&unwind_lock, flags);
600 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
601  
602         kfree(tab);
603  }
604 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
605 index 19b5f5c1c0ff..82aa639e6737 100644
606 --- a/arch/arm/kvm/arm.c
607 +++ b/arch/arm/kvm/arm.c
608 @@ -619,7 +619,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
609                  * involves poking the GIC, which must be done in a
610                  * non-preemptible context.
611                  */
612 -               preempt_disable();
613 +               migrate_disable();
614                 kvm_pmu_flush_hwstate(vcpu);
615                 kvm_timer_flush_hwstate(vcpu);
616                 kvm_vgic_flush_hwstate(vcpu);
617 @@ -640,7 +640,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
618                         kvm_pmu_sync_hwstate(vcpu);
619                         kvm_timer_sync_hwstate(vcpu);
620                         kvm_vgic_sync_hwstate(vcpu);
621 -                       preempt_enable();
622 +                       migrate_enable();
623                         continue;
624                 }
625  
626 @@ -696,7 +696,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
627  
628                 kvm_vgic_sync_hwstate(vcpu);
629  
630 -               preempt_enable();
631 +               migrate_enable();
632  
633                 ret = handle_exit(vcpu, run, ret);
634         }
635 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
636 index 98ffe1e62ad5..df9769ddece5 100644
637 --- a/arch/arm/mach-exynos/platsmp.c
638 +++ b/arch/arm/mach-exynos/platsmp.c
639 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
640         return (void __iomem *)(S5P_VA_SCU);
641  }
642  
643 -static DEFINE_SPINLOCK(boot_lock);
644 +static DEFINE_RAW_SPINLOCK(boot_lock);
645  
646  static void exynos_secondary_init(unsigned int cpu)
647  {
648 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
649         /*
650          * Synchronise with the boot thread.
651          */
652 -       spin_lock(&boot_lock);
653 -       spin_unlock(&boot_lock);
654 +       raw_spin_lock(&boot_lock);
655 +       raw_spin_unlock(&boot_lock);
656  }
657  
658  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
659 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
660          * Set synchronisation state between this boot processor
661          * and the secondary one
662          */
663 -       spin_lock(&boot_lock);
664 +       raw_spin_lock(&boot_lock);
665  
666         /*
667          * The secondary processor is waiting to be released from
668 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
669  
670                 if (timeout == 0) {
671                         printk(KERN_ERR "cpu1 power enable failed");
672 -                       spin_unlock(&boot_lock);
673 +                       raw_spin_unlock(&boot_lock);
674                         return -ETIMEDOUT;
675                 }
676         }
677 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
678          * calibrations, then wait for it to finish
679          */
680  fail:
681 -       spin_unlock(&boot_lock);
682 +       raw_spin_unlock(&boot_lock);
683  
684         return pen_release != -1 ? ret : 0;
685  }
686 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
687 index 4b653a8cb75c..b03d5a922cb1 100644
688 --- a/arch/arm/mach-hisi/platmcpm.c
689 +++ b/arch/arm/mach-hisi/platmcpm.c
690 @@ -61,7 +61,7 @@
691  
692  static void __iomem *sysctrl, *fabric;
693  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
694 -static DEFINE_SPINLOCK(boot_lock);
695 +static DEFINE_RAW_SPINLOCK(boot_lock);
696  static u32 fabric_phys_addr;
697  /*
698   * [0]: bootwrapper physical address
699 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
700         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
701                 return -EINVAL;
702  
703 -       spin_lock_irq(&boot_lock);
704 +       raw_spin_lock_irq(&boot_lock);
705  
706         if (hip04_cpu_table[cluster][cpu])
707                 goto out;
708 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
709  
710  out:
711         hip04_cpu_table[cluster][cpu]++;
712 -       spin_unlock_irq(&boot_lock);
713 +       raw_spin_unlock_irq(&boot_lock);
714  
715         return 0;
716  }
717 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
718         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
719         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
720  
721 -       spin_lock(&boot_lock);
722 +       raw_spin_lock(&boot_lock);
723         hip04_cpu_table[cluster][cpu]--;
724         if (hip04_cpu_table[cluster][cpu] == 1) {
725                 /* A power_up request went ahead of us. */
726 -               spin_unlock(&boot_lock);
727 +               raw_spin_unlock(&boot_lock);
728                 return;
729         } else if (hip04_cpu_table[cluster][cpu] > 1) {
730                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
731 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
732         }
733  
734         last_man = hip04_cluster_is_down(cluster);
735 -       spin_unlock(&boot_lock);
736 +       raw_spin_unlock(&boot_lock);
737         if (last_man) {
738                 /* Since it's Cortex A15, disable L2 prefetching. */
739                 asm volatile(
740 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
741                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
742  
743         count = TIMEOUT_MSEC / POLL_MSEC;
744 -       spin_lock_irq(&boot_lock);
745 +       raw_spin_lock_irq(&boot_lock);
746         for (tries = 0; tries < count; tries++) {
747                 if (hip04_cpu_table[cluster][cpu])
748                         goto err;
749 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
750                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
751                 if (data & CORE_WFI_STATUS(cpu))
752                         break;
753 -               spin_unlock_irq(&boot_lock);
754 +               raw_spin_unlock_irq(&boot_lock);
755                 /* Wait for clean L2 when the whole cluster is down. */
756                 msleep(POLL_MSEC);
757 -               spin_lock_irq(&boot_lock);
758 +               raw_spin_lock_irq(&boot_lock);
759         }
760         if (tries >= count)
761                 goto err;
762 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
763                 goto err;
764         if (hip04_cluster_is_down(cluster))
765                 hip04_set_snoop_filter(cluster, 0);
766 -       spin_unlock_irq(&boot_lock);
767 +       raw_spin_unlock_irq(&boot_lock);
768         return 1;
769  err:
770 -       spin_unlock_irq(&boot_lock);
771 +       raw_spin_unlock_irq(&boot_lock);
772         return 0;
773  }
774  #endif
775 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
776 index b4de3da6dffa..b52893319d75 100644
777 --- a/arch/arm/mach-omap2/omap-smp.c
778 +++ b/arch/arm/mach-omap2/omap-smp.c
779 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
780         .startup_addr = omap5_secondary_startup,
781  };
782  
783 -static DEFINE_SPINLOCK(boot_lock);
784 +static DEFINE_RAW_SPINLOCK(boot_lock);
785  
786  void __iomem *omap4_get_scu_base(void)
787  {
788 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
789         /*
790          * Synchronise with the boot thread.
791          */
792 -       spin_lock(&boot_lock);
793 -       spin_unlock(&boot_lock);
794 +       raw_spin_lock(&boot_lock);
795 +       raw_spin_unlock(&boot_lock);
796  }
797  
798  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
799 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
800          * Set synchronisation state between this boot processor
801          * and the secondary one
802          */
803 -       spin_lock(&boot_lock);
804 +       raw_spin_lock(&boot_lock);
805  
806         /*
807          * Update the AuxCoreBoot0 with boot state for secondary core.
808 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
809          * Now the secondary core is starting up let it run its
810          * calibrations, then wait for it to finish
811          */
812 -       spin_unlock(&boot_lock);
813 +       raw_spin_unlock(&boot_lock);
814  
815         return 0;
816  }
817 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
818 index 0875b99add18..18b6d98d2581 100644
819 --- a/arch/arm/mach-prima2/platsmp.c
820 +++ b/arch/arm/mach-prima2/platsmp.c
821 @@ -22,7 +22,7 @@
822  
823  static void __iomem *clk_base;
824  
825 -static DEFINE_SPINLOCK(boot_lock);
826 +static DEFINE_RAW_SPINLOCK(boot_lock);
827  
828  static void sirfsoc_secondary_init(unsigned int cpu)
829  {
830 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
831         /*
832          * Synchronise with the boot thread.
833          */
834 -       spin_lock(&boot_lock);
835 -       spin_unlock(&boot_lock);
836 +       raw_spin_lock(&boot_lock);
837 +       raw_spin_unlock(&boot_lock);
838  }
839  
840  static const struct of_device_id clk_ids[]  = {
841 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
842         /* make sure write buffer is drained */
843         mb();
844  
845 -       spin_lock(&boot_lock);
846 +       raw_spin_lock(&boot_lock);
847  
848         /*
849          * The secondary processor is waiting to be released from
850 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
851          * now the secondary core is starting up let it run its
852          * calibrations, then wait for it to finish
853          */
854 -       spin_unlock(&boot_lock);
855 +       raw_spin_unlock(&boot_lock);
856  
857         return pen_release != -1 ? -ENOSYS : 0;
858  }
859 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
860 index 5494c9e0c909..e8ce157d3548 100644
861 --- a/arch/arm/mach-qcom/platsmp.c
862 +++ b/arch/arm/mach-qcom/platsmp.c
863 @@ -46,7 +46,7 @@
864  
865  extern void secondary_startup_arm(void);
866  
867 -static DEFINE_SPINLOCK(boot_lock);
868 +static DEFINE_RAW_SPINLOCK(boot_lock);
869  
870  #ifdef CONFIG_HOTPLUG_CPU
871  static void qcom_cpu_die(unsigned int cpu)
872 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
873         /*
874          * Synchronise with the boot thread.
875          */
876 -       spin_lock(&boot_lock);
877 -       spin_unlock(&boot_lock);
878 +       raw_spin_lock(&boot_lock);
879 +       raw_spin_unlock(&boot_lock);
880  }
881  
882  static int scss_release_secondary(unsigned int cpu)
883 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
884          * set synchronisation state between this boot processor
885          * and the secondary one
886          */
887 -       spin_lock(&boot_lock);
888 +       raw_spin_lock(&boot_lock);
889  
890         /*
891          * Send the secondary CPU a soft interrupt, thereby causing
892 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
893          * now the secondary core is starting up let it run its
894          * calibrations, then wait for it to finish
895          */
896 -       spin_unlock(&boot_lock);
897 +       raw_spin_unlock(&boot_lock);
898  
899         return ret;
900  }
901 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
902 index 8d1e2d551786..7fa56cc78118 100644
903 --- a/arch/arm/mach-spear/platsmp.c
904 +++ b/arch/arm/mach-spear/platsmp.c
905 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
906         sync_cache_w(&pen_release);
907  }
908  
909 -static DEFINE_SPINLOCK(boot_lock);
910 +static DEFINE_RAW_SPINLOCK(boot_lock);
911  
912  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
913  
914 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
915         /*
916          * Synchronise with the boot thread.
917          */
918 -       spin_lock(&boot_lock);
919 -       spin_unlock(&boot_lock);
920 +       raw_spin_lock(&boot_lock);
921 +       raw_spin_unlock(&boot_lock);
922  }
923  
924  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
925 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
926          * set synchronisation state between this boot processor
927          * and the secondary one
928          */
929 -       spin_lock(&boot_lock);
930 +       raw_spin_lock(&boot_lock);
931  
932         /*
933          * The secondary processor is waiting to be released from
934 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
935          * now the secondary core is starting up let it run its
936          * calibrations, then wait for it to finish
937          */
938 -       spin_unlock(&boot_lock);
939 +       raw_spin_unlock(&boot_lock);
940  
941         return pen_release != -1 ? -ENOSYS : 0;
942  }
943 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
944 index ea5a2277ee46..b988e081ac79 100644
945 --- a/arch/arm/mach-sti/platsmp.c
946 +++ b/arch/arm/mach-sti/platsmp.c
947 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
948         sync_cache_w(&pen_release);
949  }
950  
951 -static DEFINE_SPINLOCK(boot_lock);
952 +static DEFINE_RAW_SPINLOCK(boot_lock);
953  
954  static void sti_secondary_init(unsigned int cpu)
955  {
956 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
957         /*
958          * Synchronise with the boot thread.
959          */
960 -       spin_lock(&boot_lock);
961 -       spin_unlock(&boot_lock);
962 +       raw_spin_lock(&boot_lock);
963 +       raw_spin_unlock(&boot_lock);
964  }
965  
966  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
967 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
968          * set synchronisation state between this boot processor
969          * and the secondary one
970          */
971 -       spin_lock(&boot_lock);
972 +       raw_spin_lock(&boot_lock);
973  
974         /*
975          * The secondary processor is waiting to be released from
976 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
977          * now the secondary core is starting up let it run its
978          * calibrations, then wait for it to finish
979          */
980 -       spin_unlock(&boot_lock);
981 +       raw_spin_unlock(&boot_lock);
982  
983         return pen_release != -1 ? -ENOSYS : 0;
984  }
985 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
986 index f7861dc83182..ce47dfe25fb0 100644
987 --- a/arch/arm/mm/fault.c
988 +++ b/arch/arm/mm/fault.c
989 @@ -433,6 +433,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
990         if (addr < TASK_SIZE)
991                 return do_page_fault(addr, fsr, regs);
992  
993 +       if (interrupts_enabled(regs))
994 +               local_irq_enable();
995 +
996         if (user_mode(regs))
997                 goto bad_area;
998  
999 @@ -500,6 +503,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1000  static int
1001  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1002  {
1003 +       if (interrupts_enabled(regs))
1004 +               local_irq_enable();
1005 +
1006         do_bad_area(addr, fsr, regs);
1007         return 0;
1008  }
1009 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1010 index d02f8187b1cc..542692dbd40a 100644
1011 --- a/arch/arm/mm/highmem.c
1012 +++ b/arch/arm/mm/highmem.c
1013 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1014         return *ptep;
1015  }
1016  
1017 +static unsigned int fixmap_idx(int type)
1018 +{
1019 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1020 +}
1021 +
1022  void *kmap(struct page *page)
1023  {
1024         might_sleep();
1025 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1026  
1027  void *kmap_atomic(struct page *page)
1028  {
1029 +       pte_t pte = mk_pte(page, kmap_prot);
1030         unsigned int idx;
1031         unsigned long vaddr;
1032         void *kmap;
1033         int type;
1034  
1035 -       preempt_disable();
1036 +       preempt_disable_nort();
1037         pagefault_disable();
1038         if (!PageHighMem(page))
1039                 return page_address(page);
1040 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1041  
1042         type = kmap_atomic_idx_push();
1043  
1044 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1045 +       idx = fixmap_idx(type);
1046         vaddr = __fix_to_virt(idx);
1047  #ifdef CONFIG_DEBUG_HIGHMEM
1048         /*
1049 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1050          * in place, so the contained TLB flush ensures the TLB is updated
1051          * with the new mapping.
1052          */
1053 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1054 +#ifdef CONFIG_PREEMPT_RT_FULL
1055 +       current->kmap_pte[type] = pte;
1056 +#endif
1057 +       set_fixmap_pte(idx, pte);
1058  
1059         return (void *)vaddr;
1060  }
1061 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1062  
1063         if (kvaddr >= (void *)FIXADDR_START) {
1064                 type = kmap_atomic_idx();
1065 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1066 +               idx = fixmap_idx(type);
1067  
1068                 if (cache_is_vivt())
1069                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1070 +#ifdef CONFIG_PREEMPT_RT_FULL
1071 +               current->kmap_pte[type] = __pte(0);
1072 +#endif
1073  #ifdef CONFIG_DEBUG_HIGHMEM
1074                 BUG_ON(vaddr != __fix_to_virt(idx));
1075 -               set_fixmap_pte(idx, __pte(0));
1076  #else
1077                 (void) idx;  /* to kill a warning */
1078  #endif
1079 +               set_fixmap_pte(idx, __pte(0));
1080                 kmap_atomic_idx_pop();
1081         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1082                 /* this address was obtained through kmap_high_get() */
1083                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1084         }
1085         pagefault_enable();
1086 -       preempt_enable();
1087 +       preempt_enable_nort();
1088  }
1089  EXPORT_SYMBOL(__kunmap_atomic);
1090  
1091  void *kmap_atomic_pfn(unsigned long pfn)
1092  {
1093 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1094         unsigned long vaddr;
1095         int idx, type;
1096         struct page *page = pfn_to_page(pfn);
1097  
1098 -       preempt_disable();
1099 +       preempt_disable_nort();
1100         pagefault_disable();
1101         if (!PageHighMem(page))
1102                 return page_address(page);
1103  
1104         type = kmap_atomic_idx_push();
1105 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1106 +       idx = fixmap_idx(type);
1107         vaddr = __fix_to_virt(idx);
1108  #ifdef CONFIG_DEBUG_HIGHMEM
1109         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1110  #endif
1111 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1112 +#ifdef CONFIG_PREEMPT_RT_FULL
1113 +       current->kmap_pte[type] = pte;
1114 +#endif
1115 +       set_fixmap_pte(idx, pte);
1116  
1117         return (void *)vaddr;
1118  }
1119 +#if defined CONFIG_PREEMPT_RT_FULL
1120 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1121 +{
1122 +       int i;
1123 +
1124 +       /*
1125 +        * Clear @prev's kmap_atomic mappings
1126 +        */
1127 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1128 +               int idx = fixmap_idx(i);
1129 +
1130 +               set_fixmap_pte(idx, __pte(0));
1131 +       }
1132 +       /*
1133 +        * Restore @next_p's kmap_atomic mappings
1134 +        */
1135 +       for (i = 0; i < next_p->kmap_idx; i++) {
1136 +               int idx = fixmap_idx(i);
1137 +
1138 +               if (!pte_none(next_p->kmap_pte[i]))
1139 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1140 +       }
1141 +}
1142 +#endif
1143 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1144 index c2366510187a..6b60f582b738 100644
1145 --- a/arch/arm/plat-versatile/platsmp.c
1146 +++ b/arch/arm/plat-versatile/platsmp.c
1147 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1148         sync_cache_w(&pen_release);
1149  }
1150  
1151 -static DEFINE_SPINLOCK(boot_lock);
1152 +static DEFINE_RAW_SPINLOCK(boot_lock);
1153  
1154  void versatile_secondary_init(unsigned int cpu)
1155  {
1156 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1157         /*
1158          * Synchronise with the boot thread.
1159          */
1160 -       spin_lock(&boot_lock);
1161 -       spin_unlock(&boot_lock);
1162 +       raw_spin_lock(&boot_lock);
1163 +       raw_spin_unlock(&boot_lock);
1164  }
1165  
1166  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1167 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1168          * Set synchronisation state between this boot processor
1169          * and the secondary one
1170          */
1171 -       spin_lock(&boot_lock);
1172 +       raw_spin_lock(&boot_lock);
1173  
1174         /*
1175          * This is really belt and braces; we hold unintended secondary
1176 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1177          * now the secondary core is starting up let it run its
1178          * calibrations, then wait for it to finish
1179          */
1180 -       spin_unlock(&boot_lock);
1181 +       raw_spin_unlock(&boot_lock);
1182  
1183         return pen_release != -1 ? -ENOSYS : 0;
1184  }
1185 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1186 index cf57a7799a0f..78d1b49fbed5 100644
1187 --- a/arch/arm64/Kconfig
1188 +++ b/arch/arm64/Kconfig
1189 @@ -91,6 +91,7 @@ config ARM64
1190         select HAVE_PERF_EVENTS
1191         select HAVE_PERF_REGS
1192         select HAVE_PERF_USER_STACK_DUMP
1193 +       select HAVE_PREEMPT_LAZY
1194         select HAVE_REGS_AND_STACK_ACCESS_API
1195         select HAVE_RCU_TABLE_FREE
1196         select HAVE_SYSCALL_TRACEPOINTS
1197 @@ -704,7 +705,7 @@ config XEN_DOM0
1198  
1199  config XEN
1200         bool "Xen guest support on ARM64"
1201 -       depends on ARM64 && OF
1202 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1203         select SWIOTLB_XEN
1204         select PARAVIRT
1205         help
1206 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1207 index e9ea5a6bd449..6c500ad63c6a 100644
1208 --- a/arch/arm64/include/asm/thread_info.h
1209 +++ b/arch/arm64/include/asm/thread_info.h
1210 @@ -49,6 +49,7 @@ struct thread_info {
1211         mm_segment_t            addr_limit;     /* address limit */
1212         struct task_struct      *task;          /* main task structure */
1213         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1214 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1215         int                     cpu;            /* cpu */
1216  };
1217  
1218 @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
1219  #define TIF_NEED_RESCHED       1
1220  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1221  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1222 +#define TIF_NEED_RESCHED_LAZY  4
1223  #define TIF_NOHZ               7
1224  #define TIF_SYSCALL_TRACE      8
1225  #define TIF_SYSCALL_AUDIT      9
1226 @@ -127,6 +129,7 @@ static inline struct thread_info *current_thread_info(void)
1227  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1228  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1229  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1230 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1231  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1232  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1233  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1234 @@ -135,7 +138,9 @@ static inline struct thread_info *current_thread_info(void)
1235  #define _TIF_32BIT             (1 << TIF_32BIT)
1236  
1237  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1238 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1239 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1240 +                                _TIF_NEED_RESCHED_LAZY)
1241 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1242  
1243  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1244                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1245 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1246 index c58ddf8c4062..a8f2f7c1fe12 100644
1247 --- a/arch/arm64/kernel/asm-offsets.c
1248 +++ b/arch/arm64/kernel/asm-offsets.c
1249 @@ -38,6 +38,7 @@ int main(void)
1250    BLANK();
1251    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1252    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1253 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1254    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1255    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1256    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1257 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1258 index b4c7db434654..433d846f4f51 100644
1259 --- a/arch/arm64/kernel/entry.S
1260 +++ b/arch/arm64/kernel/entry.S
1261 @@ -430,11 +430,16 @@ el1_irq:
1262  
1263  #ifdef CONFIG_PREEMPT
1264         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1265 -       cbnz    w24, 1f                         // preempt count != 0
1266 +       cbnz    w24, 2f                         // preempt count != 0
1267         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1268 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1269 -       bl      el1_preempt
1270 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1271 +
1272 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1273 +       cbnz    w24, 2f                         // preempt lazy count != 0
1274 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1275  1:
1276 +       bl      el1_preempt
1277 +2:
1278  #endif
1279  #ifdef CONFIG_TRACE_IRQFLAGS
1280         bl      trace_hardirqs_on
1281 @@ -448,6 +453,7 @@ el1_preempt:
1282  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1283         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1284         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1285 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1286         ret     x24
1287  #endif
1288  
1289 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
1290 index 404dd67080b9..639dc6d12e72 100644
1291 --- a/arch/arm64/kernel/signal.c
1292 +++ b/arch/arm64/kernel/signal.c
1293 @@ -409,7 +409,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
1294          */
1295         trace_hardirqs_off();
1296         do {
1297 -               if (thread_flags & _TIF_NEED_RESCHED) {
1298 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1299                         schedule();
1300                 } else {
1301                         local_irq_enable();
1302 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1303 index 5e844f68e847..dc613cc10f54 100644
1304 --- a/arch/mips/Kconfig
1305 +++ b/arch/mips/Kconfig
1306 @@ -2516,7 +2516,7 @@ config MIPS_ASID_BITS_VARIABLE
1307  #
1308  config HIGHMEM
1309         bool "High Memory Support"
1310 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1311 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1312  
1313  config CPU_SUPPORTS_HIGHMEM
1314         bool
1315 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1316 index 6eda5abbd719..601e27701a4a 100644
1317 --- a/arch/powerpc/Kconfig
1318 +++ b/arch/powerpc/Kconfig
1319 @@ -52,10 +52,11 @@ config LOCKDEP_SUPPORT
1320  
1321  config RWSEM_GENERIC_SPINLOCK
1322         bool
1323 +       default y if PREEMPT_RT_FULL
1324  
1325  config RWSEM_XCHGADD_ALGORITHM
1326         bool
1327 -       default y
1328 +       default y if !PREEMPT_RT_FULL
1329  
1330  config GENERIC_LOCKBREAK
1331         bool
1332 @@ -134,6 +135,7 @@ config PPC
1333         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1334         select GENERIC_STRNCPY_FROM_USER
1335         select GENERIC_STRNLEN_USER
1336 +       select HAVE_PREEMPT_LAZY
1337         select HAVE_MOD_ARCH_SPECIFIC
1338         select MODULES_USE_ELF_RELA
1339         select CLONE_BACKWARDS
1340 @@ -321,7 +323,7 @@ menu "Kernel options"
1341  
1342  config HIGHMEM
1343         bool "High memory support"
1344 -       depends on PPC32
1345 +       depends on PPC32 && !PREEMPT_RT_FULL
1346  
1347  source kernel/Kconfig.hz
1348  source kernel/Kconfig.preempt
1349 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1350 index 87e4b2d8dcd4..981e501a4359 100644
1351 --- a/arch/powerpc/include/asm/thread_info.h
1352 +++ b/arch/powerpc/include/asm/thread_info.h
1353 @@ -43,6 +43,8 @@ struct thread_info {
1354         int             cpu;                    /* cpu we're on */
1355         int             preempt_count;          /* 0 => preemptable,
1356                                                    <0 => BUG */
1357 +       int             preempt_lazy_count;     /* 0 => preemptable,
1358 +                                                  <0 => BUG */
1359         unsigned long   local_flags;            /* private flags for thread */
1360  #ifdef CONFIG_LIVEPATCH
1361         unsigned long *livepatch_sp;
1362 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1363  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1364  #define TIF_SIGPENDING         1       /* signal pending */
1365  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1366 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1367 -                                          TIF_NEED_RESCHED */
1368 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1369  #define TIF_32BIT              4       /* 32 bit binary */
1370  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1371  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1372 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1373  #if defined(CONFIG_PPC64)
1374  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1375  #endif
1376 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1377 +                                          TIF_NEED_RESCHED */
1378  
1379  /* as above, but as bit values */
1380  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1381 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1382  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1383  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1384  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1385 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1386  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1387                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1388                                  _TIF_NOHZ)
1389  
1390  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1391                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1392 -                                _TIF_RESTORE_TM)
1393 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1394  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1395 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1396  
1397  /* Bits in local_flags */
1398  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1399 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1400 index c833d88c423d..96e9fbc3f684 100644
1401 --- a/arch/powerpc/kernel/asm-offsets.c
1402 +++ b/arch/powerpc/kernel/asm-offsets.c
1403 @@ -156,6 +156,7 @@ int main(void)
1404         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1405         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1406         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1407 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1408         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1409         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1410  
1411 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1412 index 3841d749a430..6dbaeff192b9 100644
1413 --- a/arch/powerpc/kernel/entry_32.S
1414 +++ b/arch/powerpc/kernel/entry_32.S
1415 @@ -835,7 +835,14 @@ resume_kernel:
1416         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1417         bne     restore
1418         andi.   r8,r8,_TIF_NEED_RESCHED
1419 +       bne+    1f
1420 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1421 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1422 +       bne     restore
1423 +       lwz     r0,TI_FLAGS(r9)
1424 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1425         beq+    restore
1426 +1:
1427         lwz     r3,_MSR(r1)
1428         andi.   r0,r3,MSR_EE    /* interrupts off? */
1429         beq     restore         /* don't schedule if so */
1430 @@ -846,11 +853,11 @@ resume_kernel:
1431          */
1432         bl      trace_hardirqs_off
1433  #endif
1434 -1:     bl      preempt_schedule_irq
1435 +2:     bl      preempt_schedule_irq
1436         CURRENT_THREAD_INFO(r9, r1)
1437         lwz     r3,TI_FLAGS(r9)
1438 -       andi.   r0,r3,_TIF_NEED_RESCHED
1439 -       bne-    1b
1440 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1441 +       bne-    2b
1442  #ifdef CONFIG_TRACE_IRQFLAGS
1443         /* And now, to properly rebalance the above, we tell lockdep they
1444          * are being turned back on, which will happen when we return
1445 @@ -1171,7 +1178,7 @@ global_dbcr0:
1446  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1447  
1448  do_work:                       /* r10 contains MSR_KERNEL here */
1449 -       andi.   r0,r9,_TIF_NEED_RESCHED
1450 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1451         beq     do_user_signal
1452  
1453  do_resched:                    /* r10 contains MSR_KERNEL here */
1454 @@ -1192,7 +1199,7 @@ recheck:
1455         MTMSRD(r10)             /* disable interrupts */
1456         CURRENT_THREAD_INFO(r9, r1)
1457         lwz     r9,TI_FLAGS(r9)
1458 -       andi.   r0,r9,_TIF_NEED_RESCHED
1459 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1460         bne-    do_resched
1461         andi.   r0,r9,_TIF_USER_WORK_MASK
1462         beq     restore_user
1463 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1464 index caa659671599..891080c4a41e 100644
1465 --- a/arch/powerpc/kernel/entry_64.S
1466 +++ b/arch/powerpc/kernel/entry_64.S
1467 @@ -656,7 +656,7 @@ _GLOBAL(ret_from_except_lite)
1468         bl      restore_math
1469         b       restore
1470  #endif
1471 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1472 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1473         beq     2f
1474         bl      restore_interrupts
1475         SCHEDULE_USER
1476 @@ -718,10 +718,18 @@ resume_kernel:
1477  
1478  #ifdef CONFIG_PREEMPT
1479         /* Check if we need to preempt */
1480 +       lwz     r8,TI_PREEMPT(r9)
1481 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1482 +       bne     restore
1483         andi.   r0,r4,_TIF_NEED_RESCHED
1484 +       bne+    check_count
1485 +
1486 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1487         beq+    restore
1488 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1489 +
1490         /* Check that preempt_count() == 0 and interrupts are enabled */
1491 -       lwz     r8,TI_PREEMPT(r9)
1492 +check_count:
1493         cmpwi   cr1,r8,0
1494         ld      r0,SOFTE(r1)
1495         cmpdi   r0,0
1496 @@ -738,7 +746,7 @@ resume_kernel:
1497         /* Re-test flags and eventually loop */
1498         CURRENT_THREAD_INFO(r9, r1)
1499         ld      r4,TI_FLAGS(r9)
1500 -       andi.   r0,r4,_TIF_NEED_RESCHED
1501 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1502         bne     1b
1503  
1504         /*
1505 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1506 index 028a22bfa90c..a75e2dd3e71f 100644
1507 --- a/arch/powerpc/kernel/irq.c
1508 +++ b/arch/powerpc/kernel/irq.c
1509 @@ -651,6 +651,7 @@ void irq_ctx_init(void)
1510         }
1511  }
1512  
1513 +#ifndef CONFIG_PREEMPT_RT_FULL
1514  void do_softirq_own_stack(void)
1515  {
1516         struct thread_info *curtp, *irqtp;
1517 @@ -668,6 +669,7 @@ void do_softirq_own_stack(void)
1518         if (irqtp->flags)
1519                 set_bits(irqtp->flags, &curtp->flags);
1520  }
1521 +#endif
1522  
1523  irq_hw_number_t virq_to_hw(unsigned int virq)
1524  {
1525 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1526 index 030d72df5dd5..b471a709e100 100644
1527 --- a/arch/powerpc/kernel/misc_32.S
1528 +++ b/arch/powerpc/kernel/misc_32.S
1529 @@ -41,6 +41,7 @@
1530   * We store the saved ksp_limit in the unused part
1531   * of the STACK_FRAME_OVERHEAD
1532   */
1533 +#ifndef CONFIG_PREEMPT_RT_FULL
1534  _GLOBAL(call_do_softirq)
1535         mflr    r0
1536         stw     r0,4(r1)
1537 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
1538         stw     r10,THREAD+KSP_LIMIT(r2)
1539         mtlr    r0
1540         blr
1541 +#endif
1542  
1543  /*
1544   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1545 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1546 index 4cefe6888b18..cb2ee4be999a 100644
1547 --- a/arch/powerpc/kernel/misc_64.S
1548 +++ b/arch/powerpc/kernel/misc_64.S
1549 @@ -31,6 +31,7 @@
1550  
1551         .text
1552  
1553 +#ifndef CONFIG_PREEMPT_RT_FULL
1554  _GLOBAL(call_do_softirq)
1555         mflr    r0
1556         std     r0,16(r1)
1557 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
1558         ld      r0,16(r1)
1559         mtlr    r0
1560         blr
1561 +#endif
1562  
1563  _GLOBAL(call_do_irq)
1564         mflr    r0
1565 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1566 index 029be26b5a17..9528089ea142 100644
1567 --- a/arch/powerpc/kvm/Kconfig
1568 +++ b/arch/powerpc/kvm/Kconfig
1569 @@ -175,6 +175,7 @@ config KVM_E500MC
1570  config KVM_MPIC
1571         bool "KVM in-kernel MPIC emulation"
1572         depends on KVM && E500
1573 +       depends on !PREEMPT_RT_FULL
1574         select HAVE_KVM_IRQCHIP
1575         select HAVE_KVM_IRQFD
1576         select HAVE_KVM_IRQ_ROUTING
1577 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1578 index e48462447ff0..2670cee66064 100644
1579 --- a/arch/powerpc/platforms/ps3/device-init.c
1580 +++ b/arch/powerpc/platforms/ps3/device-init.c
1581 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1582         }
1583         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1584  
1585 -       res = wait_event_interruptible(dev->done.wait,
1586 +       res = swait_event_interruptible(dev->done.wait,
1587                                        dev->done.done || kthread_should_stop());
1588         if (kthread_should_stop())
1589                 res = -EINTR;
1590 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1591 index 6c0378c0b8b5..abd58b4dff97 100644
1592 --- a/arch/sh/kernel/irq.c
1593 +++ b/arch/sh/kernel/irq.c
1594 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1595         hardirq_ctx[cpu] = NULL;
1596  }
1597  
1598 +#ifndef CONFIG_PREEMPT_RT_FULL
1599  void do_softirq_own_stack(void)
1600  {
1601         struct thread_info *curctx;
1602 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1603                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1604         );
1605  }
1606 +#endif
1607  #else
1608  static inline void handle_one_irq(unsigned int irq)
1609  {
1610 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1611 index 8b4152f3a764..c5cca159692a 100644
1612 --- a/arch/sparc/Kconfig
1613 +++ b/arch/sparc/Kconfig
1614 @@ -194,12 +194,10 @@ config NR_CPUS
1615  source kernel/Kconfig.hz
1616  
1617  config RWSEM_GENERIC_SPINLOCK
1618 -       bool
1619 -       default y if SPARC32
1620 +       def_bool PREEMPT_RT_FULL
1621  
1622  config RWSEM_XCHGADD_ALGORITHM
1623 -       bool
1624 -       default y if SPARC64
1625 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1626  
1627  config GENERIC_HWEIGHT
1628         bool
1629 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1630 index 5cbf03c14981..6067d9379e5b 100644
1631 --- a/arch/sparc/kernel/irq_64.c
1632 +++ b/arch/sparc/kernel/irq_64.c
1633 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1634         set_irq_regs(old_regs);
1635  }
1636  
1637 +#ifndef CONFIG_PREEMPT_RT_FULL
1638  void do_softirq_own_stack(void)
1639  {
1640         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1641 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1642         __asm__ __volatile__("mov %0, %%sp"
1643                              : : "r" (orig_sp));
1644  }
1645 +#endif
1646  
1647  #ifdef CONFIG_HOTPLUG_CPU
1648  void fixup_irqs(void)
1649 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1650 index b9c546a305a4..e96c2975af4f 100644
1651 --- a/arch/x86/Kconfig
1652 +++ b/arch/x86/Kconfig
1653 @@ -17,6 +17,7 @@ config X86_64
1654  ### Arch settings
1655  config X86
1656         def_bool y
1657 +       select HAVE_PREEMPT_LAZY
1658         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1659         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1660         select ANON_INODES
1661 @@ -232,8 +233,11 @@ config ARCH_MAY_HAVE_PC_FDC
1662         def_bool y
1663         depends on ISA_DMA_API
1664  
1665 +config RWSEM_GENERIC_SPINLOCK
1666 +       def_bool PREEMPT_RT_FULL
1667 +
1668  config RWSEM_XCHGADD_ALGORITHM
1669 -       def_bool y
1670 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1671  
1672  config GENERIC_CALIBRATE_DELAY
1673         def_bool y
1674 @@ -897,7 +901,7 @@ config IOMMU_HELPER
1675  config MAXSMP
1676         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1677         depends on X86_64 && SMP && DEBUG_KERNEL
1678 -       select CPUMASK_OFFSTACK
1679 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1680         ---help---
1681           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1682           If unsure, say N.
1683 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1684 index aa8b0672f87a..2429414bfc71 100644
1685 --- a/arch/x86/crypto/aesni-intel_glue.c
1686 +++ b/arch/x86/crypto/aesni-intel_glue.c
1687 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1688         err = blkcipher_walk_virt(desc, &walk);
1689         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1690  
1691 -       kernel_fpu_begin();
1692         while ((nbytes = walk.nbytes)) {
1693 +               kernel_fpu_begin();
1694                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1695 -                             nbytes & AES_BLOCK_MASK);
1696 +                               nbytes & AES_BLOCK_MASK);
1697 +               kernel_fpu_end();
1698                 nbytes &= AES_BLOCK_SIZE - 1;
1699                 err = blkcipher_walk_done(desc, &walk, nbytes);
1700         }
1701 -       kernel_fpu_end();
1702  
1703         return err;
1704  }
1705 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1706         err = blkcipher_walk_virt(desc, &walk);
1707         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1708  
1709 -       kernel_fpu_begin();
1710         while ((nbytes = walk.nbytes)) {
1711 +               kernel_fpu_begin();
1712                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1713                               nbytes & AES_BLOCK_MASK);
1714 +               kernel_fpu_end();
1715                 nbytes &= AES_BLOCK_SIZE - 1;
1716                 err = blkcipher_walk_done(desc, &walk, nbytes);
1717         }
1718 -       kernel_fpu_end();
1719  
1720         return err;
1721  }
1722 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1723         err = blkcipher_walk_virt(desc, &walk);
1724         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1725  
1726 -       kernel_fpu_begin();
1727         while ((nbytes = walk.nbytes)) {
1728 +               kernel_fpu_begin();
1729                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1730                               nbytes & AES_BLOCK_MASK, walk.iv);
1731 +               kernel_fpu_end();
1732                 nbytes &= AES_BLOCK_SIZE - 1;
1733                 err = blkcipher_walk_done(desc, &walk, nbytes);
1734         }
1735 -       kernel_fpu_end();
1736  
1737         return err;
1738  }
1739 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1740         err = blkcipher_walk_virt(desc, &walk);
1741         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1742  
1743 -       kernel_fpu_begin();
1744         while ((nbytes = walk.nbytes)) {
1745 +               kernel_fpu_begin();
1746                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1747                               nbytes & AES_BLOCK_MASK, walk.iv);
1748 +               kernel_fpu_end();
1749                 nbytes &= AES_BLOCK_SIZE - 1;
1750                 err = blkcipher_walk_done(desc, &walk, nbytes);
1751         }
1752 -       kernel_fpu_end();
1753  
1754         return err;
1755  }
1756 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1757         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1758         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1759  
1760 -       kernel_fpu_begin();
1761         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1762 +               kernel_fpu_begin();
1763                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1764                                       nbytes & AES_BLOCK_MASK, walk.iv);
1765 +               kernel_fpu_end();
1766                 nbytes &= AES_BLOCK_SIZE - 1;
1767                 err = blkcipher_walk_done(desc, &walk, nbytes);
1768         }
1769         if (walk.nbytes) {
1770 +               kernel_fpu_begin();
1771                 ctr_crypt_final(ctx, &walk);
1772 +               kernel_fpu_end();
1773                 err = blkcipher_walk_done(desc, &walk, 0);
1774         }
1775 -       kernel_fpu_end();
1776  
1777         return err;
1778  }
1779 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1780 index 8648158f3916..d7699130ee36 100644
1781 --- a/arch/x86/crypto/cast5_avx_glue.c
1782 +++ b/arch/x86/crypto/cast5_avx_glue.c
1783 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1784  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1785                      bool enc)
1786  {
1787 -       bool fpu_enabled = false;
1788 +       bool fpu_enabled;
1789         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1790         const unsigned int bsize = CAST5_BLOCK_SIZE;
1791         unsigned int nbytes;
1792 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1793                 u8 *wsrc = walk->src.virt.addr;
1794                 u8 *wdst = walk->dst.virt.addr;
1795  
1796 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1797 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1798  
1799                 /* Process multi-block batch */
1800                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1801 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1802                 } while (nbytes >= bsize);
1803  
1804  done:
1805 +               cast5_fpu_end(fpu_enabled);
1806                 err = blkcipher_walk_done(desc, walk, nbytes);
1807         }
1808 -
1809 -       cast5_fpu_end(fpu_enabled);
1810         return err;
1811  }
1812  
1813 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1814  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1815                        struct scatterlist *src, unsigned int nbytes)
1816  {
1817 -       bool fpu_enabled = false;
1818 +       bool fpu_enabled;
1819         struct blkcipher_walk walk;
1820         int err;
1821  
1822 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1823         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1824  
1825         while ((nbytes = walk.nbytes)) {
1826 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1827 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1828                 nbytes = __cbc_decrypt(desc, &walk);
1829 +               cast5_fpu_end(fpu_enabled);
1830                 err = blkcipher_walk_done(desc, &walk, nbytes);
1831         }
1832 -
1833 -       cast5_fpu_end(fpu_enabled);
1834         return err;
1835  }
1836  
1837 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1838  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1839                      struct scatterlist *src, unsigned int nbytes)
1840  {
1841 -       bool fpu_enabled = false;
1842 +       bool fpu_enabled;
1843         struct blkcipher_walk walk;
1844         int err;
1845  
1846 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1847         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1848  
1849         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1850 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1851 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1852                 nbytes = __ctr_crypt(desc, &walk);
1853 +               cast5_fpu_end(fpu_enabled);
1854                 err = blkcipher_walk_done(desc, &walk, nbytes);
1855         }
1856  
1857 -       cast5_fpu_end(fpu_enabled);
1858 -
1859         if (walk.nbytes) {
1860                 ctr_crypt_final(desc, &walk);
1861                 err = blkcipher_walk_done(desc, &walk, 0);
1862 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1863 index 6a85598931b5..3a506ce7ed93 100644
1864 --- a/arch/x86/crypto/glue_helper.c
1865 +++ b/arch/x86/crypto/glue_helper.c
1866 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1867         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1868         const unsigned int bsize = 128 / 8;
1869         unsigned int nbytes, i, func_bytes;
1870 -       bool fpu_enabled = false;
1871 +       bool fpu_enabled;
1872         int err;
1873  
1874         err = blkcipher_walk_virt(desc, walk);
1875 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1876                 u8 *wdst = walk->dst.virt.addr;
1877  
1878                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1879 -                                            desc, fpu_enabled, nbytes);
1880 +                                            desc, false, nbytes);
1881  
1882                 for (i = 0; i < gctx->num_funcs; i++) {
1883                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1884 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1885                 }
1886  
1887  done:
1888 +               glue_fpu_end(fpu_enabled);
1889                 err = blkcipher_walk_done(desc, walk, nbytes);
1890         }
1891  
1892 -       glue_fpu_end(fpu_enabled);
1893         return err;
1894  }
1895  
1896 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1897                             struct scatterlist *src, unsigned int nbytes)
1898  {
1899         const unsigned int bsize = 128 / 8;
1900 -       bool fpu_enabled = false;
1901 +       bool fpu_enabled;
1902         struct blkcipher_walk walk;
1903         int err;
1904  
1905 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1906  
1907         while ((nbytes = walk.nbytes)) {
1908                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1909 -                                            desc, fpu_enabled, nbytes);
1910 +                                            desc, false, nbytes);
1911                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1912 +               glue_fpu_end(fpu_enabled);
1913                 err = blkcipher_walk_done(desc, &walk, nbytes);
1914         }
1915  
1916 -       glue_fpu_end(fpu_enabled);
1917         return err;
1918  }
1919  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1920 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1921                           struct scatterlist *src, unsigned int nbytes)
1922  {
1923         const unsigned int bsize = 128 / 8;
1924 -       bool fpu_enabled = false;
1925 +       bool fpu_enabled;
1926         struct blkcipher_walk walk;
1927         int err;
1928  
1929 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1930  
1931         while ((nbytes = walk.nbytes) >= bsize) {
1932                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1933 -                                            desc, fpu_enabled, nbytes);
1934 +                                            desc, false, nbytes);
1935                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1936 +               glue_fpu_end(fpu_enabled);
1937                 err = blkcipher_walk_done(desc, &walk, nbytes);
1938         }
1939  
1940 -       glue_fpu_end(fpu_enabled);
1941 -
1942         if (walk.nbytes) {
1943                 glue_ctr_crypt_final_128bit(
1944                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1945 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1946                           void *tweak_ctx, void *crypt_ctx)
1947  {
1948         const unsigned int bsize = 128 / 8;
1949 -       bool fpu_enabled = false;
1950 +       bool fpu_enabled;
1951         struct blkcipher_walk walk;
1952         int err;
1953  
1954 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1955  
1956         /* set minimum length to bsize, for tweak_fn */
1957         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1958 -                                    desc, fpu_enabled,
1959 +                                    desc, false,
1960                                      nbytes < bsize ? bsize : nbytes);
1961 -
1962         /* calculate first value of T */
1963         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1964 +       glue_fpu_end(fpu_enabled);
1965  
1966         while (nbytes) {
1967 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1968 +                               desc, false, nbytes);
1969                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1970  
1971 +               glue_fpu_end(fpu_enabled);
1972                 err = blkcipher_walk_done(desc, &walk, nbytes);
1973                 nbytes = walk.nbytes;
1974         }
1975 -
1976 -       glue_fpu_end(fpu_enabled);
1977 -
1978         return err;
1979  }
1980  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1981 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1982 index bdd9cc59d20f..56d01a339ba4 100644
1983 --- a/arch/x86/entry/common.c
1984 +++ b/arch/x86/entry/common.c
1985 @@ -129,7 +129,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1986  
1987  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1988         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1989 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1990 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1991  
1992  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1993  {
1994 @@ -145,9 +145,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1995                 /* We have work to do. */
1996                 local_irq_enable();
1997  
1998 -               if (cached_flags & _TIF_NEED_RESCHED)
1999 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2000                         schedule();
2001  
2002 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2003 +               if (unlikely(current->forced_info.si_signo)) {
2004 +                       struct task_struct *t = current;
2005 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2006 +                       t->forced_info.si_signo = 0;
2007 +               }
2008 +#endif
2009                 if (cached_flags & _TIF_UPROBE)
2010                         uprobe_notify_resume(regs);
2011  
2012 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2013 index edba8606b99a..4a3389535fc6 100644
2014 --- a/arch/x86/entry/entry_32.S
2015 +++ b/arch/x86/entry/entry_32.S
2016 @@ -308,8 +308,25 @@ END(ret_from_exception)
2017  ENTRY(resume_kernel)
2018         DISABLE_INTERRUPTS(CLBR_ANY)
2019  need_resched:
2020 +       # preempt count == 0 + NEED_RS set?
2021         cmpl    $0, PER_CPU_VAR(__preempt_count)
2022 +#ifndef CONFIG_PREEMPT_LAZY
2023         jnz     restore_all
2024 +#else
2025 +       jz test_int_off
2026 +
2027 +       # at least preempt count == 0 ?
2028 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2029 +       jne restore_all
2030 +
2031 +       movl    PER_CPU_VAR(current_task), %ebp
2032 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
2033 +       jnz restore_all
2034 +
2035 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2036 +       jz restore_all
2037 +test_int_off:
2038 +#endif
2039         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2040         jz      restore_all
2041         call    preempt_schedule_irq
2042 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2043 index e7b0e7ff4c58..65916d49dbc9 100644
2044 --- a/arch/x86/entry/entry_64.S
2045 +++ b/arch/x86/entry/entry_64.S
2046 @@ -546,7 +546,23 @@ retint_kernel:
2047         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2048         jnc     1f
2049  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2050 +#ifndef CONFIG_PREEMPT_LAZY
2051         jnz     1f
2052 +#else
2053 +       jz      do_preempt_schedule_irq
2054 +
2055 +       # at least preempt count == 0 ?
2056 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2057 +       jnz     1f
2058 +
2059 +       movq    PER_CPU_VAR(current_task), %rcx
2060 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2061 +       jnz     1f
2062 +
2063 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2064 +       jnc     1f
2065 +do_preempt_schedule_irq:
2066 +#endif
2067         call    preempt_schedule_irq
2068         jmp     0b
2069  1:
2070 @@ -894,6 +910,7 @@ bad_gs:
2071         jmp     2b
2072         .previous
2073  
2074 +#ifndef CONFIG_PREEMPT_RT_FULL
2075  /* Call softirq on interrupt stack. Interrupts are off. */
2076  ENTRY(do_softirq_own_stack)
2077         pushq   %rbp
2078 @@ -906,6 +923,7 @@ ENTRY(do_softirq_own_stack)
2079         decl    PER_CPU_VAR(irq_count)
2080         ret
2081  END(do_softirq_own_stack)
2082 +#endif
2083  
2084  #ifdef CONFIG_XEN
2085  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2086 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2087 index 17f218645701..11bd1b7ee6eb 100644
2088 --- a/arch/x86/include/asm/preempt.h
2089 +++ b/arch/x86/include/asm/preempt.h
2090 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2091   * a decrement which hits zero means we have no preempt_count and should
2092   * reschedule.
2093   */
2094 -static __always_inline bool __preempt_count_dec_and_test(void)
2095 +static __always_inline bool ____preempt_count_dec_and_test(void)
2096  {
2097         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2098  }
2099  
2100 +static __always_inline bool __preempt_count_dec_and_test(void)
2101 +{
2102 +       if (____preempt_count_dec_and_test())
2103 +               return true;
2104 +#ifdef CONFIG_PREEMPT_LAZY
2105 +       if (current_thread_info()->preempt_lazy_count)
2106 +               return false;
2107 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2108 +#else
2109 +       return false;
2110 +#endif
2111 +}
2112 +
2113  /*
2114   * Returns true when we need to resched and can (barring IRQ state).
2115   */
2116  static __always_inline bool should_resched(int preempt_offset)
2117  {
2118 +#ifdef CONFIG_PREEMPT_LAZY
2119 +       u32 tmp;
2120 +
2121 +       tmp = raw_cpu_read_4(__preempt_count);
2122 +       if (tmp == preempt_offset)
2123 +               return true;
2124 +
2125 +       /* preempt count == 0 ? */
2126 +       tmp &= ~PREEMPT_NEED_RESCHED;
2127 +       if (tmp)
2128 +               return false;
2129 +       if (current_thread_info()->preempt_lazy_count)
2130 +               return false;
2131 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2132 +#else
2133         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2134 +#endif
2135  }
2136  
2137  #ifdef CONFIG_PREEMPT
2138 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2139 index 8af22be0fe61..d1328789b759 100644
2140 --- a/arch/x86/include/asm/signal.h
2141 +++ b/arch/x86/include/asm/signal.h
2142 @@ -27,6 +27,19 @@ typedef struct {
2143  #define SA_IA32_ABI    0x02000000u
2144  #define SA_X32_ABI     0x01000000u
2145  
2146 +/*
2147 + * Because some traps use the IST stack, we must keep preemption
2148 + * disabled while calling do_trap(), but do_trap() may call
2149 + * force_sig_info() which will grab the signal spin_locks for the
2150 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2151 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2152 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2153 + * trap.
2154 + */
2155 +#if defined(CONFIG_PREEMPT_RT_FULL)
2156 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2157 +#endif
2158 +
2159  #ifndef CONFIG_COMPAT
2160  typedef sigset_t compat_sigset_t;
2161  #endif
2162 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2163 index 58505f01962f..02fa39652cd6 100644
2164 --- a/arch/x86/include/asm/stackprotector.h
2165 +++ b/arch/x86/include/asm/stackprotector.h
2166 @@ -59,7 +59,7 @@
2167   */
2168  static __always_inline void boot_init_stack_canary(void)
2169  {
2170 -       u64 canary;
2171 +       u64 uninitialized_var(canary);
2172         u64 tsc;
2173  
2174  #ifdef CONFIG_X86_64
2175 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2176          * of randomness. The TSC only matters for very early init,
2177          * there it already has some randomness on most systems. Later
2178          * on during the bootup the random pool has true entropy too.
2179 +        *
2180 +        * For preempt-rt we need to weaken the randomness a bit, as
2181 +        * we can't call into the random generator from atomic context
2182 +        * due to locking constraints. We just leave canary
2183 +        * uninitialized and use the TSC based randomness on top of it.
2184          */
2185 +#ifndef CONFIG_PREEMPT_RT_FULL
2186         get_random_bytes(&canary, sizeof(canary));
2187 +#endif
2188         tsc = rdtsc();
2189         canary += tsc + (tsc << 32UL);
2190  
2191 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2192 index ad6f5eb07a95..5ceb3a1c2b1a 100644
2193 --- a/arch/x86/include/asm/thread_info.h
2194 +++ b/arch/x86/include/asm/thread_info.h
2195 @@ -54,11 +54,14 @@ struct task_struct;
2196  
2197  struct thread_info {
2198         unsigned long           flags;          /* low level flags */
2199 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2200 +                                                          <0 => BUG */
2201  };
2202  
2203  #define INIT_THREAD_INFO(tsk)                  \
2204  {                                              \
2205         .flags          = 0,                    \
2206 +       .preempt_lazy_count = 0,                \
2207  }
2208  
2209  #define init_stack             (init_thread_union.stack)
2210 @@ -67,6 +70,10 @@ struct thread_info {
2211  
2212  #include <asm/asm-offsets.h>
2213  
2214 +#define GET_THREAD_INFO(reg) \
2215 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2216 +       _ASM_SUB $(THREAD_SIZE),reg ;
2217 +
2218  #endif
2219  
2220  /*
2221 @@ -85,6 +92,7 @@ struct thread_info {
2222  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2223  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2224  #define TIF_SECCOMP            8       /* secure computing */
2225 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2226  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2227  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2228  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2229 @@ -108,6 +116,7 @@ struct thread_info {
2230  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2231  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2232  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2233 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2234  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2235  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2236  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2237 @@ -143,6 +152,8 @@ struct thread_info {
2238  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2239  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2240  
2241 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2242 +
2243  #define STACK_WARN             (THREAD_SIZE/8)
2244  
2245  /*
2246 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2247 index 57ab86d94d64..35d25e27180f 100644
2248 --- a/arch/x86/include/asm/uv/uv_bau.h
2249 +++ b/arch/x86/include/asm/uv/uv_bau.h
2250 @@ -624,9 +624,9 @@ struct bau_control {
2251         cycles_t                send_message;
2252         cycles_t                period_end;
2253         cycles_t                period_time;
2254 -       spinlock_t              uvhub_lock;
2255 -       spinlock_t              queue_lock;
2256 -       spinlock_t              disable_lock;
2257 +       raw_spinlock_t          uvhub_lock;
2258 +       raw_spinlock_t          queue_lock;
2259 +       raw_spinlock_t          disable_lock;
2260         /* tunables */
2261         int                     max_concurr;
2262         int                     max_concurr_const;
2263 @@ -815,15 +815,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2264   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2265   * on equal.
2266   */
2267 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2268 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2269  {
2270 -       spin_lock(lock);
2271 +       raw_spin_lock(lock);
2272         if (atomic_read(v) >= u) {
2273 -               spin_unlock(lock);
2274 +               raw_spin_unlock(lock);
2275                 return 0;
2276         }
2277         atomic_inc(v);
2278 -       spin_unlock(lock);
2279 +       raw_spin_unlock(lock);
2280         return 1;
2281  }
2282  
2283 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2284 index b89bef95f63b..c3c1ad2fce5c 100644
2285 --- a/arch/x86/kernel/acpi/boot.c
2286 +++ b/arch/x86/kernel/acpi/boot.c
2287 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2288   *             ->ioapic_mutex
2289   *                     ->ioapic_lock
2290   */
2291 +#ifdef CONFIG_X86_IO_APIC
2292  static DEFINE_MUTEX(acpi_ioapic_lock);
2293 +#endif
2294  
2295  /* --------------------------------------------------------------------------
2296                                Boot-time Configuration
2297 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2298 index cf89928dbd46..18b5ec2a71df 100644
2299 --- a/arch/x86/kernel/apic/io_apic.c
2300 +++ b/arch/x86/kernel/apic/io_apic.c
2301 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2302  static inline bool ioapic_irqd_mask(struct irq_data *data)
2303  {
2304         /* If we are moving the irq we need to mask it */
2305 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2306 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2307 +                    !irqd_irq_inprogress(data))) {
2308                 mask_ioapic_irq(data);
2309                 return true;
2310         }
2311 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2312 index c62e015b126c..0cc71257fca6 100644
2313 --- a/arch/x86/kernel/asm-offsets.c
2314 +++ b/arch/x86/kernel/asm-offsets.c
2315 @@ -36,6 +36,7 @@ void common(void) {
2316  
2317         BLANK();
2318         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2319 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2320         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2321  
2322         BLANK();
2323 @@ -91,4 +92,5 @@ void common(void) {
2324  
2325         BLANK();
2326         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2327 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2328  }
2329 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2330 index 8ca5f8ad008e..edcbd18b3189 100644
2331 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2332 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2333 @@ -41,6 +41,8 @@
2334  #include <linux/debugfs.h>
2335  #include <linux/irq_work.h>
2336  #include <linux/export.h>
2337 +#include <linux/jiffies.h>
2338 +#include <linux/swork.h>
2339  #include <linux/jump_label.h>
2340  
2341  #include <asm/processor.h>
2342 @@ -1306,7 +1308,7 @@ void mce_log_therm_throt_event(__u64 status)
2343  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2344  
2345  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2346 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2347 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2348  
2349  static unsigned long mce_adjust_timer_default(unsigned long interval)
2350  {
2351 @@ -1315,32 +1317,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2352  
2353  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2354  
2355 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2356 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2357  {
2358 -       unsigned long when = jiffies + interval;
2359 -       unsigned long flags;
2360 -
2361 -       local_irq_save(flags);
2362 -
2363 -       if (timer_pending(t)) {
2364 -               if (time_before(when, t->expires))
2365 -                       mod_timer(t, when);
2366 -       } else {
2367 -               t->expires = round_jiffies(when);
2368 -               add_timer_on(t, smp_processor_id());
2369 -       }
2370 -
2371 -       local_irq_restore(flags);
2372 +       if (!interval)
2373 +               return HRTIMER_NORESTART;
2374 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2375 +       return HRTIMER_RESTART;
2376  }
2377  
2378 -static void mce_timer_fn(unsigned long data)
2379 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2380  {
2381 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2382 -       int cpu = smp_processor_id();
2383         unsigned long iv;
2384  
2385 -       WARN_ON(cpu != data);
2386 -
2387         iv = __this_cpu_read(mce_next_interval);
2388  
2389         if (mce_available(this_cpu_ptr(&cpu_info))) {
2390 @@ -1363,7 +1351,7 @@ static void mce_timer_fn(unsigned long data)
2391  
2392  done:
2393         __this_cpu_write(mce_next_interval, iv);
2394 -       __restart_timer(t, iv);
2395 +       return __restart_timer(timer, iv);
2396  }
2397  
2398  /*
2399 @@ -1371,7 +1359,7 @@ static void mce_timer_fn(unsigned long data)
2400   */
2401  void mce_timer_kick(unsigned long interval)
2402  {
2403 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2404 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2405         unsigned long iv = __this_cpu_read(mce_next_interval);
2406  
2407         __restart_timer(t, interval);
2408 @@ -1386,7 +1374,7 @@ static void mce_timer_delete_all(void)
2409         int cpu;
2410  
2411         for_each_online_cpu(cpu)
2412 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2413 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2414  }
2415  
2416  static void mce_do_trigger(struct work_struct *work)
2417 @@ -1396,6 +1384,56 @@ static void mce_do_trigger(struct work_struct *work)
2418  
2419  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2420  
2421 +static void __mce_notify_work(struct swork_event *event)
2422 +{
2423 +       /* Not more than two messages every minute */
2424 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2425 +
2426 +       /* wake processes polling /dev/mcelog */
2427 +       wake_up_interruptible(&mce_chrdev_wait);
2428 +
2429 +       /*
2430 +        * There is no risk of missing notifications because
2431 +        * work_pending is always cleared before the function is
2432 +        * executed.
2433 +        */
2434 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2435 +               schedule_work(&mce_trigger_work);
2436 +
2437 +       if (__ratelimit(&ratelimit))
2438 +               pr_info(HW_ERR "Machine check events logged\n");
2439 +}
2440 +
2441 +#ifdef CONFIG_PREEMPT_RT_FULL
2442 +static bool notify_work_ready __read_mostly;
2443 +static struct swork_event notify_work;
2444 +
2445 +static int mce_notify_work_init(void)
2446 +{
2447 +       int err;
2448 +
2449 +       err = swork_get();
2450 +       if (err)
2451 +               return err;
2452 +
2453 +       INIT_SWORK(&notify_work, __mce_notify_work);
2454 +       notify_work_ready = true;
2455 +       return 0;
2456 +}
2457 +
2458 +static void mce_notify_work(void)
2459 +{
2460 +       if (notify_work_ready)
2461 +               swork_queue(&notify_work);
2462 +}
2463 +#else
2464 +static void mce_notify_work(void)
2465 +{
2466 +       __mce_notify_work(NULL);
2467 +}
2468 +static inline int mce_notify_work_init(void) { return 0; }
2469 +#endif
2470 +
2471  /*
2472   * Notify the user(s) about new machine check events.
2473   * Can be called from interrupt context, but not from machine check/NMI
2474 @@ -1403,19 +1441,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2475   */
2476  int mce_notify_irq(void)
2477  {
2478 -       /* Not more than two messages every minute */
2479 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2480 -
2481         if (test_and_clear_bit(0, &mce_need_notify)) {
2482 -               /* wake processes polling /dev/mcelog */
2483 -               wake_up_interruptible(&mce_chrdev_wait);
2484 -
2485 -               if (mce_helper[0])
2486 -                       schedule_work(&mce_trigger_work);
2487 -
2488 -               if (__ratelimit(&ratelimit))
2489 -                       pr_info(HW_ERR "Machine check events logged\n");
2490 -
2491 +               mce_notify_work();
2492                 return 1;
2493         }
2494         return 0;
2495 @@ -1721,7 +1748,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2496         }
2497  }
2498  
2499 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2500 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2501  {
2502         unsigned long iv = check_interval * HZ;
2503  
2504 @@ -1730,16 +1757,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2505  
2506         per_cpu(mce_next_interval, cpu) = iv;
2507  
2508 -       t->expires = round_jiffies(jiffies + iv);
2509 -       add_timer_on(t, cpu);
2510 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2511 +                       0, HRTIMER_MODE_REL_PINNED);
2512  }
2513  
2514  static void __mcheck_cpu_init_timer(void)
2515  {
2516 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2517 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2518         unsigned int cpu = smp_processor_id();
2519  
2520 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2521 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2522 +       t->function = mce_timer_fn;
2523         mce_start_timer(cpu, t);
2524  }
2525  
2526 @@ -2464,6 +2492,8 @@ static void mce_disable_cpu(void *h)
2527         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2528                 return;
2529  
2530 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2531 +
2532         if (!(action & CPU_TASKS_FROZEN))
2533                 cmci_clear();
2534  
2535 @@ -2486,6 +2516,7 @@ static void mce_reenable_cpu(void *h)
2536                 if (b->init)
2537                         wrmsrl(msr_ops.ctl(i), b->ctl);
2538         }
2539 +       __mcheck_cpu_init_timer();
2540  }
2541  
2542  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2543 @@ -2493,7 +2524,6 @@ static int
2544  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2545  {
2546         unsigned int cpu = (unsigned long)hcpu;
2547 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2548  
2549         switch (action & ~CPU_TASKS_FROZEN) {
2550         case CPU_ONLINE:
2551 @@ -2513,11 +2543,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2552                 break;
2553         case CPU_DOWN_PREPARE:
2554                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2555 -               del_timer_sync(t);
2556                 break;
2557         case CPU_DOWN_FAILED:
2558                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2559 -               mce_start_timer(cpu, t);
2560                 break;
2561         }
2562  
2563 @@ -2556,6 +2584,10 @@ static __init int mcheck_init_device(void)
2564                 goto err_out;
2565         }
2566  
2567 +       err = mce_notify_work_init();
2568 +       if (err)
2569 +               goto err_out;
2570 +
2571         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2572                 err = -ENOMEM;
2573                 goto err_out;
2574 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2575 index 1f38d9a4d9de..053bf3b2ef39 100644
2576 --- a/arch/x86/kernel/irq_32.c
2577 +++ b/arch/x86/kernel/irq_32.c
2578 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2579                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2580  }
2581  
2582 +#ifndef CONFIG_PREEMPT_RT_FULL
2583  void do_softirq_own_stack(void)
2584  {
2585         struct irq_stack *irqstk;
2586 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2587  
2588         call_on_stack(__do_softirq, isp);
2589  }
2590 +#endif
2591  
2592  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2593  {
2594 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2595 index bd7be8efdc4c..b3b0a7f7b1ca 100644
2596 --- a/arch/x86/kernel/process_32.c
2597 +++ b/arch/x86/kernel/process_32.c
2598 @@ -35,6 +35,7 @@
2599  #include <linux/uaccess.h>
2600  #include <linux/io.h>
2601  #include <linux/kdebug.h>
2602 +#include <linux/highmem.h>
2603  
2604  #include <asm/pgtable.h>
2605  #include <asm/ldt.h>
2606 @@ -195,6 +196,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2607  }
2608  EXPORT_SYMBOL_GPL(start_thread);
2609  
2610 +#ifdef CONFIG_PREEMPT_RT_FULL
2611 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2612 +{
2613 +       int i;
2614 +
2615 +       /*
2616 +        * Clear @prev's kmap_atomic mappings
2617 +        */
2618 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2619 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2620 +               pte_t *ptep = kmap_pte - idx;
2621 +
2622 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2623 +       }
2624 +       /*
2625 +        * Restore @next_p's kmap_atomic mappings
2626 +        */
2627 +       for (i = 0; i < next_p->kmap_idx; i++) {
2628 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2629 +
2630 +               if (!pte_none(next_p->kmap_pte[i]))
2631 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2632 +       }
2633 +}
2634 +#else
2635 +static inline void
2636 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2637 +#endif
2638 +
2639  
2640  /*
2641   *     switch_to(x,y) should switch tasks from x to y.
2642 @@ -271,6 +301,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2643                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2644                 __switch_to_xtra(prev_p, next_p, tss);
2645  
2646 +       switch_kmaps(prev_p, next_p);
2647 +
2648         /*
2649          * Leave lazy mode, flushing any hypercalls made here.
2650          * This must be done before restoring TLS segments so
2651 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2652 index b24b3c6d686e..02a062b0de5d 100644
2653 --- a/arch/x86/kvm/lapic.c
2654 +++ b/arch/x86/kvm/lapic.c
2655 @@ -1944,6 +1944,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2656         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2657                      HRTIMER_MODE_ABS_PINNED);
2658         apic->lapic_timer.timer.function = apic_timer_fn;
2659 +       apic->lapic_timer.timer.irqsafe = 1;
2660  
2661         /*
2662          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2663 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2664 index 02d45296a97c..4963bd51d20b 100644
2665 --- a/arch/x86/kvm/x86.c
2666 +++ b/arch/x86/kvm/x86.c
2667 @@ -5966,6 +5966,13 @@ int kvm_arch_init(void *opaque)
2668                 goto out;
2669         }
2670  
2671 +#ifdef CONFIG_PREEMPT_RT_FULL
2672 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2673 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2674 +               return -EOPNOTSUPP;
2675 +       }
2676 +#endif
2677 +
2678         r = kvm_mmu_module_init();
2679         if (r)
2680                 goto out_free_percpu;
2681 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2682 index 6d18b70ed5a9..f752724c22e8 100644
2683 --- a/arch/x86/mm/highmem_32.c
2684 +++ b/arch/x86/mm/highmem_32.c
2685 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2686   */
2687  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2688  {
2689 +       pte_t pte = mk_pte(page, prot);
2690         unsigned long vaddr;
2691         int idx, type;
2692  
2693 -       preempt_disable();
2694 +       preempt_disable_nort();
2695         pagefault_disable();
2696  
2697         if (!PageHighMem(page))
2698 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2699         idx = type + KM_TYPE_NR*smp_processor_id();
2700         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2701         BUG_ON(!pte_none(*(kmap_pte-idx)));
2702 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2703 +#ifdef CONFIG_PREEMPT_RT_FULL
2704 +       current->kmap_pte[type] = pte;
2705 +#endif
2706 +       set_pte(kmap_pte-idx, pte);
2707         arch_flush_lazy_mmu_mode();
2708  
2709         return (void *)vaddr;
2710 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2711                  * is a bad idea also, in case the page changes cacheability
2712                  * attributes or becomes a protected page in a hypervisor.
2713                  */
2714 +#ifdef CONFIG_PREEMPT_RT_FULL
2715 +               current->kmap_pte[type] = __pte(0);
2716 +#endif
2717                 kpte_clear_flush(kmap_pte-idx, vaddr);
2718                 kmap_atomic_idx_pop();
2719                 arch_flush_lazy_mmu_mode();
2720 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2721  #endif
2722  
2723         pagefault_enable();
2724 -       preempt_enable();
2725 +       preempt_enable_nort();
2726  }
2727  EXPORT_SYMBOL(__kunmap_atomic);
2728  
2729 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2730 index ada98b39b8ad..585f6829653b 100644
2731 --- a/arch/x86/mm/iomap_32.c
2732 +++ b/arch/x86/mm/iomap_32.c
2733 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2734  
2735  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2736  {
2737 +       pte_t pte = pfn_pte(pfn, prot);
2738         unsigned long vaddr;
2739         int idx, type;
2740  
2741 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2742         type = kmap_atomic_idx_push();
2743         idx = type + KM_TYPE_NR * smp_processor_id();
2744         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2745 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2746 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2747 +
2748 +#ifdef CONFIG_PREEMPT_RT_FULL
2749 +       current->kmap_pte[type] = pte;
2750 +#endif
2751 +       set_pte(kmap_pte - idx, pte);
2752         arch_flush_lazy_mmu_mode();
2753  
2754         return (void *)vaddr;
2755 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2756                  * is a bad idea also, in case the page changes cacheability
2757                  * attributes or becomes a protected page in a hypervisor.
2758                  */
2759 +#ifdef CONFIG_PREEMPT_RT_FULL
2760 +               current->kmap_pte[type] = __pte(0);
2761 +#endif
2762                 kpte_clear_flush(kmap_pte-idx, vaddr);
2763                 kmap_atomic_idx_pop();
2764         }
2765 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2766 index e3353c97d086..01664968555c 100644
2767 --- a/arch/x86/mm/pageattr.c
2768 +++ b/arch/x86/mm/pageattr.c
2769 @@ -214,7 +214,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
2770                             int in_flags, struct page **pages)
2771  {
2772         unsigned int i, level;
2773 +#ifdef CONFIG_PREEMPT
2774 +       /*
2775 +        * Avoid wbinvd() because it causes latencies on all CPUs,
2776 +        * regardless of any CPU isolation that may be in effect.
2777 +        */
2778 +       unsigned long do_wbinvd = 0;
2779 +#else
2780         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
2781 +#endif
2782  
2783         BUG_ON(irqs_disabled());
2784  
2785 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2786 index 9e42842e924a..5398f97172f9 100644
2787 --- a/arch/x86/platform/uv/tlb_uv.c
2788 +++ b/arch/x86/platform/uv/tlb_uv.c
2789 @@ -748,9 +748,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2790  
2791                 quiesce_local_uvhub(hmaster);
2792  
2793 -               spin_lock(&hmaster->queue_lock);
2794 +               raw_spin_lock(&hmaster->queue_lock);
2795                 reset_with_ipi(&bau_desc->distribution, bcp);
2796 -               spin_unlock(&hmaster->queue_lock);
2797 +               raw_spin_unlock(&hmaster->queue_lock);
2798  
2799                 end_uvhub_quiesce(hmaster);
2800  
2801 @@ -770,9 +770,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2802  
2803                 quiesce_local_uvhub(hmaster);
2804  
2805 -               spin_lock(&hmaster->queue_lock);
2806 +               raw_spin_lock(&hmaster->queue_lock);
2807                 reset_with_ipi(&bau_desc->distribution, bcp);
2808 -               spin_unlock(&hmaster->queue_lock);
2809 +               raw_spin_unlock(&hmaster->queue_lock);
2810  
2811                 end_uvhub_quiesce(hmaster);
2812  
2813 @@ -793,7 +793,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2814         cycles_t tm1;
2815  
2816         hmaster = bcp->uvhub_master;
2817 -       spin_lock(&hmaster->disable_lock);
2818 +       raw_spin_lock(&hmaster->disable_lock);
2819         if (!bcp->baudisabled) {
2820                 stat->s_bau_disabled++;
2821                 tm1 = get_cycles();
2822 @@ -806,7 +806,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2823                         }
2824                 }
2825         }
2826 -       spin_unlock(&hmaster->disable_lock);
2827 +       raw_spin_unlock(&hmaster->disable_lock);
2828  }
2829  
2830  static void count_max_concurr(int stat, struct bau_control *bcp,
2831 @@ -869,7 +869,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2832   */
2833  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2834  {
2835 -       spinlock_t *lock = &hmaster->uvhub_lock;
2836 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2837         atomic_t *v;
2838  
2839         v = &hmaster->active_descriptor_count;
2840 @@ -1002,7 +1002,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2841         struct bau_control *hmaster;
2842  
2843         hmaster = bcp->uvhub_master;
2844 -       spin_lock(&hmaster->disable_lock);
2845 +       raw_spin_lock(&hmaster->disable_lock);
2846         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2847                 stat->s_bau_reenabled++;
2848                 for_each_present_cpu(tcpu) {
2849 @@ -1014,10 +1014,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2850                                 tbcp->period_giveups = 0;
2851                         }
2852                 }
2853 -               spin_unlock(&hmaster->disable_lock);
2854 +               raw_spin_unlock(&hmaster->disable_lock);
2855                 return 0;
2856         }
2857 -       spin_unlock(&hmaster->disable_lock);
2858 +       raw_spin_unlock(&hmaster->disable_lock);
2859         return -1;
2860  }
2861  
2862 @@ -1940,9 +1940,9 @@ static void __init init_per_cpu_tunables(void)
2863                 bcp->cong_reps                  = congested_reps;
2864                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2865                 bcp->giveup_limit               = giveup_limit;
2866 -               spin_lock_init(&bcp->queue_lock);
2867 -               spin_lock_init(&bcp->uvhub_lock);
2868 -               spin_lock_init(&bcp->disable_lock);
2869 +               raw_spin_lock_init(&bcp->queue_lock);
2870 +               raw_spin_lock_init(&bcp->uvhub_lock);
2871 +               raw_spin_lock_init(&bcp->disable_lock);
2872         }
2873  }
2874  
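
The tlb_uv.c hunks above follow the usual RT conversion for locks taken from paths that must not sleep: spinlock_t (a sleeping lock on PREEMPT_RT) becomes raw_spinlock_t, which always busy-waits. A minimal sketch of that pattern, with illustrative demo_* names rather than the BAU structures:

    #include <linux/spinlock.h>

    struct demo_hub {
            raw_spinlock_t  lock;   /* was: spinlock_t lock; */
            int             count;
    };

    static void demo_hub_init(struct demo_hub *hub)
    {
            raw_spin_lock_init(&hub->lock);         /* was: spin_lock_init() */
    }

    static void demo_hub_account(struct demo_hub *hub)
    {
            raw_spin_lock(&hub->lock);              /* spins even on PREEMPT_RT */
            hub->count++;
            raw_spin_unlock(&hub->lock);
    }
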
2875 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2876 index b333fc45f9ec..8b85916e6986 100644
2877 --- a/arch/x86/platform/uv/uv_time.c
2878 +++ b/arch/x86/platform/uv/uv_time.c
2879 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2880  
2881  /* There is one of these allocated per node */
2882  struct uv_rtc_timer_head {
2883 -       spinlock_t      lock;
2884 +       raw_spinlock_t  lock;
2885         /* next cpu waiting for timer, local node relative: */
2886         int             next_cpu;
2887         /* number of cpus on this node: */
2888 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2889                                 uv_rtc_deallocate_timers();
2890                                 return -ENOMEM;
2891                         }
2892 -                       spin_lock_init(&head->lock);
2893 +                       raw_spin_lock_init(&head->lock);
2894                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2895                         head->next_cpu = -1;
2896                         blade_info[bid] = head;
2897 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2898         unsigned long flags;
2899         int next_cpu;
2900  
2901 -       spin_lock_irqsave(&head->lock, flags);
2902 +       raw_spin_lock_irqsave(&head->lock, flags);
2903  
2904         next_cpu = head->next_cpu;
2905         *t = expires;
2906 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2907                 if (uv_setup_intr(cpu, expires)) {
2908                         *t = ULLONG_MAX;
2909                         uv_rtc_find_next_timer(head, pnode);
2910 -                       spin_unlock_irqrestore(&head->lock, flags);
2911 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2912                         return -ETIME;
2913                 }
2914         }
2915  
2916 -       spin_unlock_irqrestore(&head->lock, flags);
2917 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2918         return 0;
2919  }
2920  
2921 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2922         unsigned long flags;
2923         int rc = 0;
2924  
2925 -       spin_lock_irqsave(&head->lock, flags);
2926 +       raw_spin_lock_irqsave(&head->lock, flags);
2927  
2928         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2929                 rc = 1;
2930 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2931                         uv_rtc_find_next_timer(head, pnode);
2932         }
2933  
2934 -       spin_unlock_irqrestore(&head->lock, flags);
2935 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2936  
2937         return rc;
2938  }
2939 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2940  static cycle_t uv_read_rtc(struct clocksource *cs)
2941  {
2942         unsigned long offset;
2943 +       cycle_t cycles;
2944  
2945 +       preempt_disable();
2946         if (uv_get_min_hub_revision_id() == 1)
2947                 offset = 0;
2948         else
2949                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2950  
2951 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2952 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2953 +       preempt_enable();
2954 +
2955 +       return cycles;
2956  }
2957  
2958  /*
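
uv_read_rtc() above now brackets the per-blade offset computation and the MMR read with preempt_disable()/preempt_enable(), so the task cannot migrate between choosing this CPU's offset and reading the register. The same "pin, read, unpin" pattern in isolation (demo_* name and the percpu counter are illustrative only):

    #include <linux/percpu.h>
    #include <linux/preempt.h>
    #include <linux/types.h>

    static u64 demo_read_percpu_counter(u64 __percpu *counters)
    {
            u64 val;

            preempt_disable();      /* no migration between choosing and reading */
            val = *this_cpu_ptr(counters);
            preempt_enable();

            return val;
    }
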
2959 diff --git a/block/blk-core.c b/block/blk-core.c
2960 index b1c76aa73492..5808a85a7974 100644
2961 --- a/block/blk-core.c
2962 +++ b/block/blk-core.c
2963 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
2964  
2965         INIT_LIST_HEAD(&rq->queuelist);
2966         INIT_LIST_HEAD(&rq->timeout_list);
2967 +#ifdef CONFIG_PREEMPT_RT_FULL
2968 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2969 +#endif
2970         rq->cpu = -1;
2971         rq->q = q;
2972         rq->__sector = (sector_t) -1;
2973 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
2974   **/
2975  void blk_start_queue(struct request_queue *q)
2976  {
2977 -       WARN_ON(!in_interrupt() && !irqs_disabled());
2978 +       WARN_ON_NONRT(!in_interrupt() && !irqs_disabled());
2979  
2980         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2981         __blk_run_queue(q);
2982 @@ -660,7 +663,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
2983                 if (nowait)
2984                         return -EBUSY;
2985  
2986 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2987 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2988                                 !atomic_read(&q->mq_freeze_depth) ||
2989                                 blk_queue_dying(q));
2990                 if (blk_queue_dying(q))
2991 @@ -680,7 +683,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
2992         struct request_queue *q =
2993                 container_of(ref, struct request_queue, q_usage_counter);
2994  
2995 -       wake_up_all(&q->mq_freeze_wq);
2996 +       swake_up_all(&q->mq_freeze_wq);
2997  }
2998  
2999  static void blk_rq_timed_out_timer(unsigned long data)
3000 @@ -750,7 +753,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3001         q->bypass_depth = 1;
3002         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3003  
3004 -       init_waitqueue_head(&q->mq_freeze_wq);
3005 +       init_swait_queue_head(&q->mq_freeze_wq);
3006  
3007         /*
3008          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3009 @@ -3202,7 +3205,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3010                 blk_run_queue_async(q);
3011         else
3012                 __blk_run_queue(q);
3013 -       spin_unlock(q->queue_lock);
3014 +       spin_unlock_irq(q->queue_lock);
3015  }
3016  
3017  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3018 @@ -3250,7 +3253,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3019  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3020  {
3021         struct request_queue *q;
3022 -       unsigned long flags;
3023         struct request *rq;
3024         LIST_HEAD(list);
3025         unsigned int depth;
3026 @@ -3270,11 +3272,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3027         q = NULL;
3028         depth = 0;
3029  
3030 -       /*
3031 -        * Save and disable interrupts here, to avoid doing it for every
3032 -        * queue lock we have to take.
3033 -        */
3034 -       local_irq_save(flags);
3035         while (!list_empty(&list)) {
3036                 rq = list_entry_rq(list.next);
3037                 list_del_init(&rq->queuelist);
3038 @@ -3287,7 +3284,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3039                                 queue_unplugged(q, depth, from_schedule);
3040                         q = rq->q;
3041                         depth = 0;
3042 -                       spin_lock(q->queue_lock);
3043 +                       spin_lock_irq(q->queue_lock);
3044                 }
3045  
3046                 /*
3047 @@ -3314,8 +3311,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3048          */
3049         if (q)
3050                 queue_unplugged(q, depth, from_schedule);
3051 -
3052 -       local_irq_restore(flags);
3053  }
3054  
3055  void blk_finish_plug(struct blk_plug *plug)
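
blk-core.c (and blk-mq.c just below) convert mq_freeze_wq from a regular wait queue to a simple wait queue; swait keeps the wakeup path lightweight and usable from the contexts RT cares about here, and the API is otherwise parallel. The same calls in a stand-alone sketch, with illustrative demo_* names:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);       /* was: DECLARE_WAIT_QUEUE_HEAD() */
    static bool demo_done;

    static void demo_wait_for_done(void)
    {
            swait_event(demo_wq, demo_done);        /* was: wait_event() */
    }

    static void demo_mark_done(void)
    {
            demo_done = true;
            swake_up_all(&demo_wq);                 /* was: wake_up_all() */
    }
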
3056 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3057 index 381cb50a673c..dc8785233d94 100644
3058 --- a/block/blk-ioc.c
3059 +++ b/block/blk-ioc.c
3060 @@ -7,6 +7,7 @@
3061  #include <linux/bio.h>
3062  #include <linux/blkdev.h>
3063  #include <linux/slab.h>
3064 +#include <linux/delay.h>
3065  
3066  #include "blk.h"
3067  
3068 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3069                         spin_unlock(q->queue_lock);
3070                 } else {
3071                         spin_unlock_irqrestore(&ioc->lock, flags);
3072 -                       cpu_relax();
3073 +                       cpu_chill();
3074                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3075                 }
3076         }
3077 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3078                         spin_unlock(icq->q->queue_lock);
3079                 } else {
3080                         spin_unlock_irqrestore(&ioc->lock, flags);
3081 -                       cpu_relax();
3082 +                       cpu_chill();
3083                         goto retry;
3084                 }
3085         }
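
cpu_chill() is introduced elsewhere in this patch (the blk-ioc.c hunk pulls in <linux/delay.h> for it); on RT it sleeps briefly instead of spinning, so a task that lost a trylock race yields the CPU rather than busy-waiting against a lock holder that may itself be preempted. The retry idiom in isolation, with illustrative demo_* names:

    #include <linux/delay.h>        /* cpu_chill() is added to this header by the patch */
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_outer);
    static DEFINE_SPINLOCK(demo_inner);

    static void demo_take_both(void)
    {
    retry:
            spin_lock(&demo_outer);
            if (!spin_trylock(&demo_inner)) {
                    spin_unlock(&demo_outer);
                    cpu_chill();            /* was: cpu_relax() */
                    goto retry;
            }
            /* ... work with both locks held ... */
            spin_unlock(&demo_inner);
            spin_unlock(&demo_outer);
    }
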
3086 diff --git a/block/blk-mq.c b/block/blk-mq.c
3087 index 7b597ec4e9c5..48c9652a701c 100644
3088 --- a/block/blk-mq.c
3089 +++ b/block/blk-mq.c
3090 @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3091  
3092  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3093  {
3094 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3095 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3096  }
3097  
3098  /*
3099 @@ -110,7 +110,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3100         WARN_ON_ONCE(freeze_depth < 0);
3101         if (!freeze_depth) {
3102                 percpu_ref_reinit(&q->q_usage_counter);
3103 -               wake_up_all(&q->mq_freeze_wq);
3104 +               swake_up_all(&q->mq_freeze_wq);
3105         }
3106  }
3107  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3108 @@ -129,7 +129,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3109          * dying, we need to ensure that processes currently waiting on
3110          * the queue are notified as well.
3111          */
3112 -       wake_up_all(&q->mq_freeze_wq);
3113 +       swake_up_all(&q->mq_freeze_wq);
3114  }
3115  
3116  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3117 @@ -177,6 +177,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3118         rq->resid_len = 0;
3119         rq->sense = NULL;
3120  
3121 +#ifdef CONFIG_PREEMPT_RT_FULL
3122 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3123 +#endif
3124         INIT_LIST_HEAD(&rq->timeout_list);
3125         rq->timeout = 0;
3126  
3127 @@ -345,6 +348,17 @@ void blk_mq_end_request(struct request *rq, int error)
3128  }
3129  EXPORT_SYMBOL(blk_mq_end_request);
3130  
3131 +#ifdef CONFIG_PREEMPT_RT_FULL
3132 +
3133 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3134 +{
3135 +       struct request *rq = container_of(work, struct request, work);
3136 +
3137 +       rq->q->softirq_done_fn(rq);
3138 +}
3139 +
3140 +#else
3141 +
3142  static void __blk_mq_complete_request_remote(void *data)
3143  {
3144         struct request *rq = data;
3145 @@ -352,6 +366,8 @@ static void __blk_mq_complete_request_remote(void *data)
3146         rq->q->softirq_done_fn(rq);
3147  }
3148  
3149 +#endif
3150 +
3151  static void blk_mq_ipi_complete_request(struct request *rq)
3152  {
3153         struct blk_mq_ctx *ctx = rq->mq_ctx;
3154 @@ -363,19 +379,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3155                 return;
3156         }
3157  
3158 -       cpu = get_cpu();
3159 +       cpu = get_cpu_light();
3160         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3161                 shared = cpus_share_cache(cpu, ctx->cpu);
3162  
3163         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3164 +#ifdef CONFIG_PREEMPT_RT_FULL
3165 +               schedule_work_on(ctx->cpu, &rq->work);
3166 +#else
3167                 rq->csd.func = __blk_mq_complete_request_remote;
3168                 rq->csd.info = rq;
3169                 rq->csd.flags = 0;
3170                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3171 +#endif
3172         } else {
3173                 rq->q->softirq_done_fn(rq);
3174         }
3175 -       put_cpu();
3176 +       put_cpu_light();
3177  }
3178  
3179  static void __blk_mq_complete_request(struct request *rq)
3180 @@ -906,14 +926,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3181                 return;
3182  
3183         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3184 -               int cpu = get_cpu();
3185 +               int cpu = get_cpu_light();
3186                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3187                         __blk_mq_run_hw_queue(hctx);
3188 -                       put_cpu();
3189 +                       put_cpu_light();
3190                         return;
3191                 }
3192  
3193 -               put_cpu();
3194 +               put_cpu_light();
3195         }
3196  
3197         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
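
get_cpu_light()/put_cpu_light() come from this patch: unlike get_cpu()/put_cpu() they only disable migration, so the section stays preemptible on RT while the returned CPU id remains the task's CPU. On the completion side, the IPI is replaced by schedule_work_on() so softirq_done_fn() runs from a schedulable worker on the target CPU instead of hard-interrupt context. The migration-pinning pattern in isolation (demo_* is illustrative):

    #include <linux/printk.h>
    #include <linux/smp.h>

    static void demo_touch_this_cpu(void)
    {
            int cpu = get_cpu_light();      /* was: get_cpu(); RT variant only pins migration */

            /* 'cpu' is still this task's CPU here, yet the code may be preempted */
            pr_debug("servicing completion on CPU %d\n", cpu);

            put_cpu_light();                /* was: put_cpu() */
    }
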
3198 diff --git a/block/blk-mq.h b/block/blk-mq.h
3199 index e5d25249028c..1e846b842eab 100644
3200 --- a/block/blk-mq.h
3201 +++ b/block/blk-mq.h
3202 @@ -72,12 +72,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3203   */
3204  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3205  {
3206 -       return __blk_mq_get_ctx(q, get_cpu());
3207 +       return __blk_mq_get_ctx(q, get_cpu_light());
3208  }
3209  
3210  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3211  {
3212 -       put_cpu();
3213 +       put_cpu_light();
3214  }
3215  
3216  struct blk_mq_alloc_data {
3217 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3218 index 06cf9807f49a..c40342643ca0 100644
3219 --- a/block/blk-softirq.c
3220 +++ b/block/blk-softirq.c
3221 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3222                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3223  
3224         local_irq_restore(flags);
3225 +       preempt_check_resched_rt();
3226  }
3227  
3228  /*
3229 @@ -89,6 +90,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
3230                          this_cpu_ptr(&blk_cpu_done));
3231         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3232         local_irq_enable();
3233 +       preempt_check_resched_rt();
3234  
3235         return 0;
3236  }
3237 @@ -141,6 +143,7 @@ void __blk_complete_request(struct request *req)
3238                 goto do_local;
3239  
3240         local_irq_restore(flags);
3241 +       preempt_check_resched_rt();
3242  }
3243  
3244  /**
3245 diff --git a/block/bounce.c b/block/bounce.c
3246 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3247 --- a/block/bounce.c
3248 +++ b/block/bounce.c
3249 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3250         unsigned long flags;
3251         unsigned char *vto;
3252  
3253 -       local_irq_save(flags);
3254 +       local_irq_save_nort(flags);
3255         vto = kmap_atomic(to->bv_page);
3256         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3257         kunmap_atomic(vto);
3258 -       local_irq_restore(flags);
3259 +       local_irq_restore_nort(flags);
3260  }
3261  
3262  #else /* CONFIG_HIGHMEM */
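
local_irq_save_nort()/local_irq_restore_nort() (like the preempt_disable_nort() and WARN_ON_NONRT() variants used earlier) are helpers added by this patch: on !RT they compile to the plain versions, on RT they become (near) no-ops, since kmap_atomic() and a short memcpy() are preemptible there and do not need interrupts off. The bounce_copy_vec() change above is the canonical shape of that conversion; in isolation it looks like this (demo_* names are illustrative):

    #include <linux/highmem.h>
    #include <linux/string.h>

    static void demo_copy_to_page(struct page *page, const void *src, size_t len)
    {
            unsigned long flags;
            void *dst;

            local_irq_save_nort(flags);     /* real irq-off only on !PREEMPT_RT_FULL */
            dst = kmap_atomic(page);
            memcpy(dst, src, len);
            kunmap_atomic(dst);
            local_irq_restore_nort(flags);
    }
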
3263 diff --git a/crypto/algapi.c b/crypto/algapi.c
3264 index 1fad2a6b3bbb..ecb7315426a9 100644
3265 --- a/crypto/algapi.c
3266 +++ b/crypto/algapi.c
3267 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3268  
3269  int crypto_register_notifier(struct notifier_block *nb)
3270  {
3271 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3272 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3273  }
3274  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3275  
3276  int crypto_unregister_notifier(struct notifier_block *nb)
3277  {
3278 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3279 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3280  }
3281  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3282  
3283 diff --git a/crypto/api.c b/crypto/api.c
3284 index bbc147cb5dec..bc1a848f02ec 100644
3285 --- a/crypto/api.c
3286 +++ b/crypto/api.c
3287 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3288  DECLARE_RWSEM(crypto_alg_sem);
3289  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3290  
3291 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3292 +SRCU_NOTIFIER_HEAD(crypto_chain);
3293  EXPORT_SYMBOL_GPL(crypto_chain);
3294  
3295  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3296 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3297  {
3298         int ok;
3299  
3300 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3301 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3302         if (ok == NOTIFY_DONE) {
3303                 request_module("cryptomgr");
3304 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3305 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3306         }
3307  
3308         return ok;
3309 diff --git a/crypto/internal.h b/crypto/internal.h
3310 index 7eefcdb00227..0ecc7f5a2f40 100644
3311 --- a/crypto/internal.h
3312 +++ b/crypto/internal.h
3313 @@ -47,7 +47,7 @@ struct crypto_larval {
3314  
3315  extern struct list_head crypto_alg_list;
3316  extern struct rw_semaphore crypto_alg_sem;
3317 -extern struct blocking_notifier_head crypto_chain;
3318 +extern struct srcu_notifier_head crypto_chain;
3319  
3320  #ifdef CONFIG_PROC_FS
3321  void __init crypto_init_proc(void);
3322 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3323  
3324  static inline void crypto_notify(unsigned long val, void *v)
3325  {
3326 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3327 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3328  }
3329  
3330  #endif /* _CRYPTO_INTERNAL_H */
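
The three crypto hunks above switch the algorithm notifier chain from a blocking (rw-semaphore based) notifier to an SRCU notifier chain, so notifier calls no longer go through the chain's semaphore; registration and call sites change names but keep the same shape. The SRCU notifier API in a stand-alone sketch, with illustrative demo_* names:

    #include <linux/notifier.h>

    SRCU_NOTIFIER_HEAD(demo_chain);         /* was: BLOCKING_NOTIFIER_HEAD() */

    static int demo_callback(struct notifier_block *nb, unsigned long event, void *data)
    {
            return NOTIFY_OK;
    }

    static struct notifier_block demo_nb = { .notifier_call = demo_callback };

    static void demo_use_chain(void)
    {
            srcu_notifier_chain_register(&demo_chain, &demo_nb);
            srcu_notifier_call_chain(&demo_chain, 1, NULL);
            srcu_notifier_chain_unregister(&demo_chain, &demo_nb);
    }
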
3331 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3332 index 750fa824d42c..441edf51484a 100644
3333 --- a/drivers/acpi/acpica/acglobal.h
3334 +++ b/drivers/acpi/acpica/acglobal.h
3335 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3336   * interrupt level
3337   */
3338  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3339 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3340 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3341  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3342  
3343  /* Mutex for _OSI support */
3344 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3345 index 3b7fb99362b6..696bf8e62afb 100644
3346 --- a/drivers/acpi/acpica/hwregs.c
3347 +++ b/drivers/acpi/acpica/hwregs.c
3348 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3349                           ACPI_BITMASK_ALL_FIXED_STATUS,
3350                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3351  
3352 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3353 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3354  
3355         /* Clear the fixed events in PM1 A/B */
3356  
3357         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3358                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3359  
3360 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3361 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3362  
3363         if (ACPI_FAILURE(status)) {
3364                 goto exit;
3365 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3366 index 98c26ff39409..6e236f2ea791 100644
3367 --- a/drivers/acpi/acpica/hwxface.c
3368 +++ b/drivers/acpi/acpica/hwxface.c
3369 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3370                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3371         }
3372  
3373 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3374 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3375  
3376         /*
3377          * At this point, we know that the parent register is one of the
3378 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3379  
3380  unlock_and_exit:
3381  
3382 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3383 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3384         return_ACPI_STATUS(status);
3385  }
3386  
3387 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3388 index 15073375bd00..357e7ca5a587 100644
3389 --- a/drivers/acpi/acpica/utmutex.c
3390 +++ b/drivers/acpi/acpica/utmutex.c
3391 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3392                 return_ACPI_STATUS (status);
3393         }
3394  
3395 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3396 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3397         if (ACPI_FAILURE (status)) {
3398                 return_ACPI_STATUS (status);
3399         }
3400 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3401         /* Delete the spinlocks */
3402  
3403         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3404 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3405 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3406         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3407  
3408         /* Delete the reader/writer lock */
3409 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3410 index 051b6158d1b7..7ad293bef6ed 100644
3411 --- a/drivers/ata/libata-sff.c
3412 +++ b/drivers/ata/libata-sff.c
3413 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3414         unsigned long flags;
3415         unsigned int consumed;
3416  
3417 -       local_irq_save(flags);
3418 +       local_irq_save_nort(flags);
3419         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3420 -       local_irq_restore(flags);
3421 +       local_irq_restore_nort(flags);
3422  
3423         return consumed;
3424  }
3425 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3426                 unsigned long flags;
3427  
3428                 /* FIXME: use a bounce buffer */
3429 -               local_irq_save(flags);
3430 +               local_irq_save_nort(flags);
3431                 buf = kmap_atomic(page);
3432  
3433                 /* do the actual data transfer */
3434 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3435                                        do_write);
3436  
3437                 kunmap_atomic(buf);
3438 -               local_irq_restore(flags);
3439 +               local_irq_restore_nort(flags);
3440         } else {
3441                 buf = page_address(page);
3442                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3443 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3444                 unsigned long flags;
3445  
3446                 /* FIXME: use bounce buffer */
3447 -               local_irq_save(flags);
3448 +               local_irq_save_nort(flags);
3449                 buf = kmap_atomic(page);
3450  
3451                 /* do the actual data transfer */
3452 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3453                                                                 count, rw);
3454  
3455                 kunmap_atomic(buf);
3456 -               local_irq_restore(flags);
3457 +               local_irq_restore_nort(flags);
3458         } else {
3459                 buf = page_address(page);
3460                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3461 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3462 index 4b5cd3a7b2b6..8c93ee150ee8 100644
3463 --- a/drivers/block/zram/zcomp.c
3464 +++ b/drivers/block/zram/zcomp.c
3465 @@ -118,12 +118,20 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3466  
3467  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3468  {
3469 -       return *get_cpu_ptr(comp->stream);
3470 +       struct zcomp_strm *zstrm;
3471 +
3472 +       zstrm = *get_local_ptr(comp->stream);
3473 +       spin_lock(&zstrm->zcomp_lock);
3474 +       return zstrm;
3475  }
3476  
3477  void zcomp_stream_put(struct zcomp *comp)
3478  {
3479 -       put_cpu_ptr(comp->stream);
3480 +       struct zcomp_strm *zstrm;
3481 +
3482 +       zstrm = *this_cpu_ptr(comp->stream);
3483 +       spin_unlock(&zstrm->zcomp_lock);
3484 +       put_local_ptr(zstrm);
3485  }
3486  
3487  int zcomp_compress(struct zcomp_strm *zstrm,
3488 @@ -174,6 +182,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3489                         pr_err("Can't allocate a compression stream\n");
3490                         return NOTIFY_BAD;
3491                 }
3492 +               spin_lock_init(&zstrm->zcomp_lock);
3493                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3494                 break;
3495         case CPU_DEAD:
3496 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3497 index 478cac2ed465..f7a6efdc3285 100644
3498 --- a/drivers/block/zram/zcomp.h
3499 +++ b/drivers/block/zram/zcomp.h
3500 @@ -14,6 +14,7 @@ struct zcomp_strm {
3501         /* compression/decompression buffer */
3502         void *buffer;
3503         struct crypto_comp *tfm;
3504 +       spinlock_t zcomp_lock;
3505  };
3506  
3507  /* dynamic per-device compression frontend */
3508 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3509 index c9914d653968..2038d138f286 100644
3510 --- a/drivers/block/zram/zram_drv.c
3511 +++ b/drivers/block/zram/zram_drv.c
3512 @@ -528,6 +528,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3513                 goto out_error;
3514         }
3515  
3516 +       zram_meta_init_table_locks(meta, disksize);
3517 +
3518         return meta;
3519  
3520  out_error:
3521 @@ -575,28 +577,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3522         struct zram_meta *meta = zram->meta;
3523         unsigned long handle;
3524         unsigned int size;
3525 +       struct zcomp_strm *zstrm;
3526  
3527 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3528 +       zram_lock_table(&meta->table[index]);
3529         handle = meta->table[index].handle;
3530         size = zram_get_obj_size(meta, index);
3531  
3532         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3533 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3534 +               zram_unlock_table(&meta->table[index]);
3535                 memset(mem, 0, PAGE_SIZE);
3536                 return 0;
3537         }
3538  
3539 +       zstrm = zcomp_stream_get(zram->comp);
3540         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3541         if (size == PAGE_SIZE) {
3542                 memcpy(mem, cmem, PAGE_SIZE);
3543         } else {
3544 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3545 -
3546                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3547 -               zcomp_stream_put(zram->comp);
3548         }
3549         zs_unmap_object(meta->mem_pool, handle);
3550 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3551 +       zcomp_stream_put(zram->comp);
3552 +       zram_unlock_table(&meta->table[index]);
3553  
3554         /* Should NEVER happen. Return bio error if it does. */
3555         if (unlikely(ret)) {
3556 @@ -616,14 +618,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3557         struct zram_meta *meta = zram->meta;
3558         page = bvec->bv_page;
3559  
3560 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3561 +       zram_lock_table(&meta->table[index]);
3562         if (unlikely(!meta->table[index].handle) ||
3563                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3564 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3565 +               zram_unlock_table(&meta->table[index]);
3566                 handle_zero_page(bvec);
3567                 return 0;
3568         }
3569 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3570 +       zram_unlock_table(&meta->table[index]);
3571  
3572         if (is_partial_io(bvec))
3573                 /* Use  a temporary buffer to decompress the page */
3574 @@ -700,10 +702,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3575                 if (user_mem)
3576                         kunmap_atomic(user_mem);
3577                 /* Free memory associated with this sector now. */
3578 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3579 +               zram_lock_table(&meta->table[index]);
3580                 zram_free_page(zram, index);
3581                 zram_set_flag(meta, index, ZRAM_ZERO);
3582 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3583 +               zram_unlock_table(&meta->table[index]);
3584  
3585                 atomic64_inc(&zram->stats.zero_pages);
3586                 ret = 0;
3587 @@ -794,12 +796,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3588          * Free memory associated with this sector
3589          * before overwriting unused sectors.
3590          */
3591 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3592 +       zram_lock_table(&meta->table[index]);
3593         zram_free_page(zram, index);
3594  
3595         meta->table[index].handle = handle;
3596         zram_set_obj_size(meta, index, clen);
3597 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3598 +       zram_unlock_table(&meta->table[index]);
3599  
3600         /* Update stats */
3601         atomic64_add(clen, &zram->stats.compr_data_size);
3602 @@ -842,9 +844,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3603         }
3604  
3605         while (n >= PAGE_SIZE) {
3606 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3607 +               zram_lock_table(&meta->table[index]);
3608                 zram_free_page(zram, index);
3609 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3610 +               zram_unlock_table(&meta->table[index]);
3611                 atomic64_inc(&zram->stats.notify_free);
3612                 index++;
3613                 n -= PAGE_SIZE;
3614 @@ -973,9 +975,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3615         zram = bdev->bd_disk->private_data;
3616         meta = zram->meta;
3617  
3618 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3619 +       zram_lock_table(&meta->table[index]);
3620         zram_free_page(zram, index);
3621 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3622 +       zram_unlock_table(&meta->table[index]);
3623         atomic64_inc(&zram->stats.notify_free);
3624  }
3625  
3626 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3627 index 74fcf10da374..fd4020c99b9e 100644
3628 --- a/drivers/block/zram/zram_drv.h
3629 +++ b/drivers/block/zram/zram_drv.h
3630 @@ -73,6 +73,9 @@ enum zram_pageflags {
3631  struct zram_table_entry {
3632         unsigned long handle;
3633         unsigned long value;
3634 +#ifdef CONFIG_PREEMPT_RT_BASE
3635 +       spinlock_t lock;
3636 +#endif
3637  };
3638  
3639  struct zram_stats {
3640 @@ -120,4 +123,42 @@ struct zram {
3641          */
3642         bool claim; /* Protected by bdev->bd_mutex */
3643  };
3644 +
3645 +#ifndef CONFIG_PREEMPT_RT_BASE
3646 +static inline void zram_lock_table(struct zram_table_entry *table)
3647 +{
3648 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3649 +}
3650 +
3651 +static inline void zram_unlock_table(struct zram_table_entry *table)
3652 +{
3653 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3654 +}
3655 +
3656 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3657 +#else /* CONFIG_PREEMPT_RT_BASE */
3658 +static inline void zram_lock_table(struct zram_table_entry *table)
3659 +{
3660 +       spin_lock(&table->lock);
3661 +       __set_bit(ZRAM_ACCESS, &table->value);
3662 +}
3663 +
3664 +static inline void zram_unlock_table(struct zram_table_entry *table)
3665 +{
3666 +       __clear_bit(ZRAM_ACCESS, &table->value);
3667 +       spin_unlock(&table->lock);
3668 +}
3669 +
3670 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3671 +{
3672 +        size_t num_pages = disksize >> PAGE_SHIFT;
3673 +        size_t index;
3674 +
3675 +        for (index = 0; index < num_pages; index++) {
3676 +               spinlock_t *lock = &meta->table[index].lock;
3677 +               spin_lock_init(lock);
3678 +        }
3679 +}
3680 +#endif /* CONFIG_PREEMPT_RT_BASE */
3681 +
3682  #endif
3683 diff --git a/drivers/char/random.c b/drivers/char/random.c
3684 index 08d1dd58c0d2..25ee319dc8e3 100644
3685 --- a/drivers/char/random.c
3686 +++ b/drivers/char/random.c
3687 @@ -262,6 +262,7 @@
3688  #include <linux/syscalls.h>
3689  #include <linux/completion.h>
3690  #include <linux/uuid.h>
3691 +#include <linux/locallock.h>
3692  #include <crypto/chacha20.h>
3693  
3694  #include <asm/processor.h>
3695 @@ -1028,8 +1029,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3696         } sample;
3697         long delta, delta2, delta3;
3698  
3699 -       preempt_disable();
3700 -
3701         sample.jiffies = jiffies;
3702         sample.cycles = random_get_entropy();
3703         sample.num = num;
3704 @@ -1070,7 +1069,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3705                  */
3706                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3707         }
3708 -       preempt_enable();
3709  }
3710  
3711  void add_input_randomness(unsigned int type, unsigned int code,
3712 @@ -1123,28 +1121,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3713         return *(ptr + f->reg_idx++);
3714  }
3715  
3716 -void add_interrupt_randomness(int irq, int irq_flags)
3717 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3718  {
3719         struct entropy_store    *r;
3720         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3721 -       struct pt_regs          *regs = get_irq_regs();
3722         unsigned long           now = jiffies;
3723         cycles_t                cycles = random_get_entropy();
3724         __u32                   c_high, j_high;
3725 -       __u64                   ip;
3726         unsigned long           seed;
3727         int                     credit = 0;
3728  
3729         if (cycles == 0)
3730 -               cycles = get_reg(fast_pool, regs);
3731 +               cycles = get_reg(fast_pool, NULL);
3732         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3733         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3734         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3735         fast_pool->pool[1] ^= now ^ c_high;
3736 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3737 +       if (!ip)
3738 +               ip = _RET_IP_;
3739         fast_pool->pool[2] ^= ip;
3740         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3741 -               get_reg(fast_pool, regs);
3742 +               get_reg(fast_pool, NULL);
3743  
3744         fast_mix(fast_pool);
3745         add_interrupt_bench(cycles);
3746 @@ -2056,6 +2053,7 @@ struct batched_entropy {
3747   * goal of being quite fast and not depleting entropy.
3748   */
3749  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_long);
3750 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_long_lock);
3751  unsigned long get_random_long(void)
3752  {
3753         unsigned long ret;
3754 @@ -2064,13 +2062,13 @@ unsigned long get_random_long(void)
3755         if (arch_get_random_long(&ret))
3756                 return ret;
3757  
3758 -       batch = &get_cpu_var(batched_entropy_long);
3759 +       batch = &get_locked_var(batched_entropy_long_lock, batched_entropy_long);
3760         if (batch->position % ARRAY_SIZE(batch->entropy_long) == 0) {
3761                 extract_crng((u8 *)batch->entropy_long);
3762                 batch->position = 0;
3763         }
3764         ret = batch->entropy_long[batch->position++];
3765 -       put_cpu_var(batched_entropy_long);
3766 +       put_locked_var(batched_entropy_long_lock, batched_entropy_long);
3767         return ret;
3768  }
3769  EXPORT_SYMBOL(get_random_long);
3770 @@ -2082,6 +2080,8 @@ unsigned int get_random_int(void)
3771  }
3772  #else
3773  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_int);
3774 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_int_lock);
3775 +
3776  unsigned int get_random_int(void)
3777  {
3778         unsigned int ret;
3779 @@ -2090,13 +2090,13 @@ unsigned int get_random_int(void)
3780         if (arch_get_random_int(&ret))
3781                 return ret;
3782  
3783 -       batch = &get_cpu_var(batched_entropy_int);
3784 +       batch = &get_locked_var(batched_entropy_int_lock, batched_entropy_int);
3785         if (batch->position % ARRAY_SIZE(batch->entropy_int) == 0) {
3786                 extract_crng((u8 *)batch->entropy_int);
3787                 batch->position = 0;
3788         }
3789         ret = batch->entropy_int[batch->position++];
3790 -       put_cpu_var(batched_entropy_int);
3791 +       put_locked_var(batched_entropy_int_lock, batched_entropy_int);
3792         return ret;
3793  }
3794  #endif
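
DEFINE_LOCAL_IRQ_LOCK(), get_locked_var()/put_locked_var() and the local_lock()/local_unlock() pair used in later hunks all come from the locallock.h header this patch introduces: on !RT they reduce to the usual preempt/irq-disabled per-CPU access, on RT they take a per-CPU sleeping lock so the section stays preemptible yet serialized per CPU. A stand-alone sketch of the batched-value pattern used above (demo_* names are illustrative):

    #include <linux/locallock.h>    /* header added by this patch */
    #include <linux/percpu.h>

    struct demo_batch {
            unsigned int pos;
            unsigned int vals[16];
    };

    static DEFINE_PER_CPU(struct demo_batch, demo_batch);
    static DEFINE_LOCAL_IRQ_LOCK(demo_batch_lock);

    static unsigned int demo_next_val(void)
    {
            struct demo_batch *batch;
            unsigned int ret;

            /* was: batch = &get_cpu_var(demo_batch); */
            batch = &get_locked_var(demo_batch_lock, demo_batch);
            ret = batch->vals[batch->pos++ & 15];
            put_locked_var(demo_batch_lock, demo_batch);    /* was: put_cpu_var() */

            return ret;
    }
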
3795 diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
3796 index 8022bea27fed..247330efd310 100644
3797 --- a/drivers/char/tpm/tpm_tis.c
3798 +++ b/drivers/char/tpm/tpm_tis.c
3799 @@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
3800         return container_of(data, struct tpm_tis_tcg_phy, priv);
3801  }
3802  
3803 +#ifdef CONFIG_PREEMPT_RT_FULL
3804 +/*
3805 + * Flushes previous write operations to chip so that a subsequent
3806 + * ioread*()s won't stall a cpu.
3807 + */
3808 +static inline void tpm_tis_flush(void __iomem *iobase)
3809 +{
3810 +       ioread8(iobase + TPM_ACCESS(0));
3811 +}
3812 +#else
3813 +#define tpm_tis_flush(iobase) do { } while (0)
3814 +#endif
3815 +
3816 +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
3817 +{
3818 +       iowrite8(b, iobase + addr);
3819 +       tpm_tis_flush(iobase);
3820 +}
3821 +
3822 +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
3823 +{
3824 +       iowrite32(b, iobase + addr);
3825 +       tpm_tis_flush(iobase);
3826 +}
3827 +
3828  static bool interrupts = true;
3829  module_param(interrupts, bool, 0444);
3830  MODULE_PARM_DESC(interrupts, "Enable interrupts");
3831 @@ -103,7 +128,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
3832         struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
3833  
3834         while (len--)
3835 -               iowrite8(*value++, phy->iobase + addr);
3836 +               tpm_tis_iowrite8(*value++, phy->iobase, addr);
3837         return 0;
3838  }
3839  
3840 @@ -127,7 +152,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
3841  {
3842         struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
3843  
3844 -       iowrite32(value, phy->iobase + addr);
3845 +       tpm_tis_iowrite32(value, phy->iobase, addr);
3846         return 0;
3847  }
3848  
3849 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3850 index 4da2af9694a2..5b6f57f500b8 100644
3851 --- a/drivers/clocksource/tcb_clksrc.c
3852 +++ b/drivers/clocksource/tcb_clksrc.c
3853 @@ -23,8 +23,7 @@
3854   *     this 32 bit free-running counter. the second channel is not used.
3855   *
3856   *   - The third channel may be used to provide a 16-bit clockevent
3857 - *     source, used in either periodic or oneshot mode.  This runs
3858 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3859 + *     source, used in either periodic or oneshot mode.
3860   *
3861   * A boot clocksource and clockevent source are also currently needed,
3862   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3863 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3864  struct tc_clkevt_device {
3865         struct clock_event_device       clkevt;
3866         struct clk                      *clk;
3867 +       bool                            clk_enabled;
3868 +       u32                             freq;
3869         void __iomem                    *regs;
3870  };
3871  
3872 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3873         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3874  }
3875  
3876 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3877 - * because using one of the divided clocks would usually mean the
3878 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3879 - *
3880 - * A divided clock could be good for high resolution timers, since
3881 - * 30.5 usec resolution can seem "low".
3882 - */
3883  static u32 timer_clock;
3884  
3885 +static void tc_clk_disable(struct clock_event_device *d)
3886 +{
3887 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3888 +
3889 +       clk_disable(tcd->clk);
3890 +       tcd->clk_enabled = false;
3891 +}
3892 +
3893 +static void tc_clk_enable(struct clock_event_device *d)
3894 +{
3895 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3896 +
3897 +       if (tcd->clk_enabled)
3898 +               return;
3899 +       clk_enable(tcd->clk);
3900 +       tcd->clk_enabled = true;
3901 +}
3902 +
3903  static int tc_shutdown(struct clock_event_device *d)
3904  {
3905         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3906 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3907  
3908         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3909         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3910 +       return 0;
3911 +}
3912 +
3913 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3914 +{
3915 +       tc_shutdown(d);
3916         if (!clockevent_state_detached(d))
3917 -               clk_disable(tcd->clk);
3918 +               tc_clk_disable(d);
3919  
3920         return 0;
3921  }
3922 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3923         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3924                 tc_shutdown(d);
3925  
3926 -       clk_enable(tcd->clk);
3927 +       tc_clk_enable(d);
3928  
3929 -       /* slow clock, count up to RC, then irq and stop */
3930 +       /* count up to RC, then irq and stop */
3931         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3932                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3933         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3934 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3935         /* By not making the gentime core emulate periodic mode on top
3936          * of oneshot, we get lower overhead and improved accuracy.
3937          */
3938 -       clk_enable(tcd->clk);
3939 +       tc_clk_enable(d);
3940  
3941 -       /* slow clock, count up to RC, then irq and restart */
3942 +       /* count up to RC, then irq and restart */
3943         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3944                      regs + ATMEL_TC_REG(2, CMR));
3945 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3946 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3947  
3948         /* Enable clock and interrupts on RC compare */
3949         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3950 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3951                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3952                                           CLOCK_EVT_FEAT_ONESHOT,
3953                 /* Should be lower than at91rm9200's system timer */
3954 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3955                 .rating                 = 125,
3956 +#else
3957 +               .rating                 = 200,
3958 +#endif
3959                 .set_next_event         = tc_next_event,
3960 -               .set_state_shutdown     = tc_shutdown,
3961 +               .set_state_shutdown     = tc_shutdown_clk_off,
3962                 .set_state_periodic     = tc_set_periodic,
3963                 .set_state_oneshot      = tc_set_oneshot,
3964         },
3965 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3966         return IRQ_NONE;
3967  }
3968  
3969 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3970 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3971  {
3972 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3973         int ret;
3974         struct clk *t2_clk = tc->clk[2];
3975         int irq = tc->irq[2];
3976 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3977         clkevt.regs = tc->regs;
3978         clkevt.clk = t2_clk;
3979  
3980 -       timer_clock = clk32k_divisor_idx;
3981 +       timer_clock = divisor_idx;
3982 +       if (!divisor)
3983 +               clkevt.freq = 32768;
3984 +       else
3985 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3986  
3987         clkevt.clkevt.cpumask = cpumask_of(0);
3988  
3989 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3990                 return ret;
3991         }
3992  
3993 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3994 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3995  
3996         return ret;
3997  }
3998 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
3999                 goto err_disable_t1;
4000  
4001         /* channel 2:  periodic and oneshot timer support */
4002 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4003         ret = setup_clkevents(tc, clk32k_divisor_idx);
4004 +#else
4005 +       ret = setup_clkevents(tc, best_divisor_idx);
4006 +#endif
4007         if (ret)
4008                 goto err_unregister_clksrc;
4009  
4010 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
4011 index 6555821bbdae..93288849b2bd 100644
4012 --- a/drivers/clocksource/timer-atmel-pit.c
4013 +++ b/drivers/clocksource/timer-atmel-pit.c
4014 @@ -46,6 +46,7 @@ struct pit_data {
4015         u32             cycle;
4016         u32             cnt;
4017         unsigned int    irq;
4018 +       bool            irq_requested;
4019         struct clk      *mck;
4020  };
4021  
4022 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
4023  
4024         /* disable irq, leaving the clocksource active */
4025         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
4026 +       if (data->irq_requested) {
4027 +               free_irq(data->irq, data);
4028 +               data->irq_requested = false;
4029 +       }
4030         return 0;
4031  }
4032  
4033 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
4034  /*
4035   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
4036   */
4037  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
4038  {
4039         struct pit_data *data = clkevt_to_pit_data(dev);
4040 +       int ret;
4041 +
4042 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4043 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4044 +                         "at91_tick", data);
4045 +       if (ret)
4046 +               panic(pr_fmt("Unable to setup IRQ\n"));
4047 +
4048 +       data->irq_requested = true;
4049  
4050         /* update clocksource counter */
4051         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
4052 @@ -230,15 +245,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
4053                 return ret;
4054         }
4055  
4056 -       /* Set up irq handler */
4057 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4058 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4059 -                         "at91_tick", data);
4060 -       if (ret) {
4061 -               pr_err("Unable to setup IRQ\n");
4062 -               return ret;
4063 -       }
4064 -
4065         /* Set up and register clockevents */
4066         data->clkevt.name = "pit";
4067         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
4068 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
4069 index e90ab5b63a90..9e124087c55f 100644
4070 --- a/drivers/clocksource/timer-atmel-st.c
4071 +++ b/drivers/clocksource/timer-atmel-st.c
4072 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
4073         last_crtr = read_CRTR();
4074  }
4075  
4076 +static int atmel_st_irq;
4077 +
4078  static int clkevt32k_shutdown(struct clock_event_device *evt)
4079  {
4080         clkdev32k_disable_and_flush_irq();
4081         irqmask = 0;
4082         regmap_write(regmap_st, AT91_ST_IER, irqmask);
4083 +       free_irq(atmel_st_irq, regmap_st);
4084         return 0;
4085  }
4086  
4087  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4088  {
4089 +       int ret;
4090 +
4091         clkdev32k_disable_and_flush_irq();
4092  
4093 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4094 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4095 +                         "at91_tick", regmap_st);
4096 +       if (ret)
4097 +               panic(pr_fmt("Unable to setup IRQ\n"));
4098 +
4099         /*
4100          * ALM for oneshot irqs, set by next_event()
4101          * before 32 seconds have passed.
4102 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4103  
4104  static int clkevt32k_set_periodic(struct clock_event_device *dev)
4105  {
4106 +       int ret;
4107 +
4108         clkdev32k_disable_and_flush_irq();
4109  
4110 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4111 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4112 +                         "at91_tick", regmap_st);
4113 +       if (ret)
4114 +               panic(pr_fmt("Unable to setup IRQ\n"));
4115 +
4116         /* PIT for periodic irqs; fixed rate of 1/HZ */
4117         irqmask = AT91_ST_PITS;
4118         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
4119 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
4120  {
4121         struct clk *sclk;
4122         unsigned int sclk_rate, val;
4123 -       int irq, ret;
4124 +       int ret;
4125  
4126         regmap_st = syscon_node_to_regmap(node);
4127         if (IS_ERR(regmap_st)) {
4128 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
4129         regmap_read(regmap_st, AT91_ST_SR, &val);
4130  
4131         /* Get the interrupts property */
4132 -       irq  = irq_of_parse_and_map(node, 0);
4133 -       if (!irq) {
4134 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
4135 +       if (!atmel_st_irq) {
4136                 pr_err("Unable to get IRQ from DT\n");
4137                 return -EINVAL;
4138         }
4139  
4140 -       /* Make IRQs happen for the system timer */
4141 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
4142 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4143 -                         "at91_tick", regmap_st);
4144 -       if (ret) {
4145 -               pr_err("Unable to setup IRQ\n");
4146 -               return ret;
4147 -       }
4148 -
4149         sclk = of_clk_get(node, 0);
4150         if (IS_ERR(sclk)) {
4151                 pr_err("Unable to get slow clock\n");
4152 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
4153 index a782ce87715c..19d265948526 100644
4154 --- a/drivers/connector/cn_proc.c
4155 +++ b/drivers/connector/cn_proc.c
4156 @@ -32,6 +32,7 @@
4157  #include <linux/pid_namespace.h>
4158  
4159  #include <linux/cn_proc.h>
4160 +#include <linux/locallock.h>
4161  
4162  /*
4163   * Size of a cn_msg followed by a proc_event structure.  Since the
4164 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4165  
4166  /* proc_event_counts is used as the sequence number of the netlink message */
4167  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4168 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4169  
4170  static inline void send_msg(struct cn_msg *msg)
4171  {
4172 -       preempt_disable();
4173 +       local_lock(send_msg_lock);
4174  
4175         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4176         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4177 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4178          */
4179         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4180  
4181 -       preempt_enable();
4182 +       local_unlock(send_msg_lock);
4183  }
4184  
4185  void proc_fork_connector(struct task_struct *task)
4186 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4187 index adbd1de1cea5..1fac5074f2cf 100644
4188 --- a/drivers/cpufreq/Kconfig.x86
4189 +++ b/drivers/cpufreq/Kconfig.x86
4190 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4191  
4192  config X86_POWERNOW_K8
4193         tristate "AMD Opteron/Athlon64 PowerNow!"
4194 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4195 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4196         help
4197           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4198           Support for K10 and newer processors is now in acpi-cpufreq.
4199 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4200 index 2117f172d7a2..96c15501b0c8 100644
4201 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4202 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4203 @@ -1489,7 +1489,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
4204         if (ret)
4205                 return ret;
4206  
4207 +#ifndef CONFIG_PREEMPT_RT_BASE
4208         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4209 +#endif
4210  
4211         i915_gem_execbuffer_move_to_active(vmas, params->request);
4212  
4213 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4214 index 755d78832a66..97fb03dc4971 100644
4215 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4216 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4217 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4218         if (!mutex_is_locked(mutex))
4219                 return false;
4220  
4221 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4222 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4223         return mutex->owner == task;
4224  #else
4225         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4226 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4227 index 02908e37c228..05c0480576e1 100644
4228 --- a/drivers/gpu/drm/i915/i915_irq.c
4229 +++ b/drivers/gpu/drm/i915/i915_irq.c
4230 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4231         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4232  
4233         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4234 +       preempt_disable_rt();
4235  
4236         /* Get optional system timestamp before query. */
4237         if (stime)
4238 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4239                 *etime = ktime_get();
4240  
4241         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4242 +       preempt_enable_rt();
4243  
4244         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4245  
4246 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4247 index ce32303b3013..c0a53bf2e952 100644
4248 --- a/drivers/gpu/drm/i915/intel_display.c
4249 +++ b/drivers/gpu/drm/i915/intel_display.c
4250 @@ -12138,7 +12138,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4251         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4252         struct intel_flip_work *work;
4253  
4254 -       WARN_ON(!in_interrupt());
4255 +       WARN_ON_NONRT(!in_interrupt());
4256  
4257         if (crtc == NULL)
4258                 return;
4259 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4260 index 64f4e2e18594..aebf1e9eabcb 100644
4261 --- a/drivers/gpu/drm/i915/intel_sprite.c
4262 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4263 @@ -35,6 +35,7 @@
4264  #include <drm/drm_rect.h>
4265  #include <drm/drm_atomic.h>
4266  #include <drm/drm_plane_helper.h>
4267 +#include <linux/locallock.h>
4268  #include "intel_drv.h"
4269  #include "intel_frontbuffer.h"
4270  #include <drm/i915_drm.h>
4271 @@ -65,6 +66,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4272                             1000 * adjusted_mode->crtc_htotal);
4273  }
4274  
4275 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4276 +
4277  /**
4278   * intel_pipe_update_start() - start update of a set of display registers
4279   * @crtc: the crtc of which the registers are going to be updated
4280 @@ -98,7 +101,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4281         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4282         max = vblank_start - 1;
4283  
4284 -       local_irq_disable();
4285 +       local_lock_irq(pipe_update_lock);
4286  
4287         if (min <= 0 || max <= 0)
4288                 return;
4289 @@ -128,11 +131,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4290                         break;
4291                 }
4292  
4293 -               local_irq_enable();
4294 +               local_unlock_irq(pipe_update_lock);
4295  
4296                 timeout = schedule_timeout(timeout);
4297  
4298 -               local_irq_disable();
4299 +               local_lock_irq(pipe_update_lock);
4300         }
4301  
4302         finish_wait(wq, &wait);
4303 @@ -202,7 +205,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4304                 crtc->base.state->event = NULL;
4305         }
4306  
4307 -       local_irq_enable();
4308 +       local_unlock_irq(pipe_update_lock);
4309  
4310         if (crtc->debug.start_vbl_count &&
4311             crtc->debug.start_vbl_count != end_vbl_count) {
4312 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4313 index 192b2d3a79cb..d5372a207326 100644
4314 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4315 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4316 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4317         if (!mutex_is_locked(mutex))
4318                 return false;
4319  
4320 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4321 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4322         return mutex->owner == task;
4323  #else
4324         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4325 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4326 index cdb8cb568c15..b6d7fd964cbc 100644
4327 --- a/drivers/gpu/drm/radeon/radeon_display.c
4328 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4329 @@ -1845,6 +1845,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4330         struct radeon_device *rdev = dev->dev_private;
4331  
4332         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4333 +       preempt_disable_rt();
4334  
4335         /* Get optional system timestamp before query. */
4336         if (stime)
4337 @@ -1937,6 +1938,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4338                 *etime = ktime_get();
4339  
4340         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4341 +       preempt_enable_rt();
4342  
4343         /* Decode into vertical and horizontal scanout position. */
4344         *vpos = position & 0x1fff;
4345 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4346 index 0276d2ef06ee..8868045eabde 100644
4347 --- a/drivers/hv/vmbus_drv.c
4348 +++ b/drivers/hv/vmbus_drv.c
4349 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4350         void *page_addr;
4351         struct hv_message *msg;
4352         union hv_synic_event_flags *event;
4353 +       struct pt_regs *regs = get_irq_regs();
4354 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4355         bool handled = false;
4356  
4357         page_addr = hv_context.synic_event_page[cpu];
4358 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4359                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4360         }
4361  
4362 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4363 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4364  }
4365  
4366  
4367 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4368 index 36f76e28a0bf..394f142f90c7 100644
4369 --- a/drivers/ide/alim15x3.c
4370 +++ b/drivers/ide/alim15x3.c
4371 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4372  
4373         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4374  
4375 -       local_irq_save(flags);
4376 +       local_irq_save_nort(flags);
4377  
4378         if (m5229_revision < 0xC2) {
4379                 /*
4380 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4381         }
4382         pci_dev_put(north);
4383         pci_dev_put(isa_dev);
4384 -       local_irq_restore(flags);
4385 +       local_irq_restore_nort(flags);
4386         return 0;
4387  }
4388  
4389 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4390 index 0ceae5cbd89a..c212e85d7f3e 100644
4391 --- a/drivers/ide/hpt366.c
4392 +++ b/drivers/ide/hpt366.c
4393 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4394  
4395         dma_old = inb(base + 2);
4396  
4397 -       local_irq_save(flags);
4398 +       local_irq_save_nort(flags);
4399  
4400         dma_new = dma_old;
4401         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4402 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4403         if (dma_new != dma_old)
4404                 outb(dma_new, base + 2);
4405  
4406 -       local_irq_restore(flags);
4407 +       local_irq_restore_nort(flags);
4408  
4409         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4410                          hwif->name, base, base + 7);
4411 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4412 index 19763977568c..4169433faab5 100644
4413 --- a/drivers/ide/ide-io-std.c
4414 +++ b/drivers/ide/ide-io-std.c
4415 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4416                 unsigned long uninitialized_var(flags);
4417  
4418                 if ((io_32bit & 2) && !mmio) {
4419 -                       local_irq_save(flags);
4420 +                       local_irq_save_nort(flags);
4421                         ata_vlb_sync(io_ports->nsect_addr);
4422                 }
4423  
4424 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4425                         insl(data_addr, buf, words);
4426  
4427                 if ((io_32bit & 2) && !mmio)
4428 -                       local_irq_restore(flags);
4429 +                       local_irq_restore_nort(flags);
4430  
4431                 if (((len + 1) & 3) < 2)
4432                         return;
4433 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4434                 unsigned long uninitialized_var(flags);
4435  
4436                 if ((io_32bit & 2) && !mmio) {
4437 -                       local_irq_save(flags);
4438 +                       local_irq_save_nort(flags);
4439                         ata_vlb_sync(io_ports->nsect_addr);
4440                 }
4441  
4442 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4443                         outsl(data_addr, buf, words);
4444  
4445                 if ((io_32bit & 2) && !mmio)
4446 -                       local_irq_restore(flags);
4447 +                       local_irq_restore_nort(flags);
4448  
4449                 if (((len + 1) & 3) < 2)
4450                         return;
4451 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4452 index 669ea1e45795..e12e43e62245 100644
4453 --- a/drivers/ide/ide-io.c
4454 +++ b/drivers/ide/ide-io.c
4455 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4456                 /* disable_irq_nosync ?? */
4457                 disable_irq(hwif->irq);
4458                 /* local CPU only, as if we were handling an interrupt */
4459 -               local_irq_disable();
4460 +               local_irq_disable_nort();
4461                 if (hwif->polling) {
4462                         startstop = handler(drive);
4463                 } else if (drive_is_ready(drive)) {
4464 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4465 index 376f2dc410c5..f014dd1b73dc 100644
4466 --- a/drivers/ide/ide-iops.c
4467 +++ b/drivers/ide/ide-iops.c
4468 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4469                                 if ((stat & ATA_BUSY) == 0)
4470                                         break;
4471  
4472 -                               local_irq_restore(flags);
4473 +                               local_irq_restore_nort(flags);
4474                                 *rstat = stat;
4475                                 return -EBUSY;
4476                         }
4477                 }
4478 -               local_irq_restore(flags);
4479 +               local_irq_restore_nort(flags);
4480         }
4481         /*
4482          * Allow status to settle, then read it again.
4483 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4484 index 0b63facd1d87..4ceba37afc0c 100644
4485 --- a/drivers/ide/ide-probe.c
4486 +++ b/drivers/ide/ide-probe.c
4487 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4488         int bswap = 1;
4489  
4490         /* local CPU only; some systems need this */
4491 -       local_irq_save(flags);
4492 +       local_irq_save_nort(flags);
4493         /* read 512 bytes of id info */
4494         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4495 -       local_irq_restore(flags);
4496 +       local_irq_restore_nort(flags);
4497  
4498         drive->dev_flags |= IDE_DFLAG_ID_READ;
4499  #ifdef DEBUG
4500 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4501 index a716693417a3..be0568c722d6 100644
4502 --- a/drivers/ide/ide-taskfile.c
4503 +++ b/drivers/ide/ide-taskfile.c
4504 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4505  
4506                 page_is_high = PageHighMem(page);
4507                 if (page_is_high)
4508 -                       local_irq_save(flags);
4509 +                       local_irq_save_nort(flags);
4510  
4511                 buf = kmap_atomic(page) + offset;
4512  
4513 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4514                 kunmap_atomic(buf);
4515  
4516                 if (page_is_high)
4517 -                       local_irq_restore(flags);
4518 +                       local_irq_restore_nort(flags);
4519  
4520                 len -= nr_bytes;
4521         }
4522 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4523         }
4524  
4525         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4526 -               local_irq_disable();
4527 +               local_irq_disable_nort();
4528  
4529         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4530  
4531 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4532 index fddff403d5d2..cca1bb4fbfe3 100644
4533 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4534 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4535 @@ -902,7 +902,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4536  
4537         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4538  
4539 -       local_irq_save(flags);
4540 +       local_irq_save_nort(flags);
4541         netif_addr_lock(dev);
4542         spin_lock(&priv->lock);
4543  
4544 @@ -984,7 +984,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4545  
4546         spin_unlock(&priv->lock);
4547         netif_addr_unlock(dev);
4548 -       local_irq_restore(flags);
4549 +       local_irq_restore_nort(flags);
4550  
4551         /*
4552          * make sure the in-flight joins have finished before we attempt
4553 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4554 index 4a2a9e370be7..e970d9afd179 100644
4555 --- a/drivers/input/gameport/gameport.c
4556 +++ b/drivers/input/gameport/gameport.c
4557 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4558         tx = ~0;
4559  
4560         for (i = 0; i < 50; i++) {
4561 -               local_irq_save(flags);
4562 +               local_irq_save_nort(flags);
4563                 t1 = ktime_get_ns();
4564                 for (t = 0; t < 50; t++)
4565                         gameport_read(gameport);
4566                 t2 = ktime_get_ns();
4567                 t3 = ktime_get_ns();
4568 -               local_irq_restore(flags);
4569 +               local_irq_restore_nort(flags);
4570                 udelay(i * 10);
4571                 t = (t2 - t1) - (t3 - t2);
4572                 if (t < tx)
4573 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4574         tx = 1 << 30;
4575  
4576         for(i = 0; i < 50; i++) {
4577 -               local_irq_save(flags);
4578 +               local_irq_save_nort(flags);
4579                 GET_TIME(t1);
4580                 for (t = 0; t < 50; t++) gameport_read(gameport);
4581                 GET_TIME(t2);
4582                 GET_TIME(t3);
4583 -               local_irq_restore(flags);
4584 +               local_irq_restore_nort(flags);
4585                 udelay(i * 10);
4586                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4587         }
4588 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4589         tx = 1 << 30;
4590  
4591         for(i = 0; i < 50; i++) {
4592 -               local_irq_save(flags);
4593 +               local_irq_save_nort(flags);
4594                 t1 = rdtsc();
4595                 for (t = 0; t < 50; t++) gameport_read(gameport);
4596                 t2 = rdtsc();
4597 -               local_irq_restore(flags);
4598 +               local_irq_restore_nort(flags);
4599                 udelay(i * 10);
4600                 if (t2 - t1 < tx) tx = t2 - t1;
4601         }
4602 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4603 index 1a0b110f12c0..ff5c2424eb9e 100644
4604 --- a/drivers/iommu/amd_iommu.c
4605 +++ b/drivers/iommu/amd_iommu.c
4606 @@ -1923,10 +1923,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4607         int ret;
4608  
4609         /*
4610 -        * Must be called with IRQs disabled. Warn here to detect early
4611 -        * when its not.
4612 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4613 +        * detect early when it's not.
4614          */
4615 -       WARN_ON(!irqs_disabled());
4616 +       WARN_ON_NONRT(!irqs_disabled());
4617  
4618         /* lock domain */
4619         spin_lock(&domain->lock);
4620 @@ -2094,10 +2094,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4621         struct protection_domain *domain;
4622  
4623         /*
4624 -        * Must be called with IRQs disabled. Warn here to detect early
4625 -        * when its not.
4626 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4627 +        * detect early when it's not.
4628          */
4629 -       WARN_ON(!irqs_disabled());
4630 +       WARN_ON_NONRT(!irqs_disabled());
4631  
4632         if (WARN_ON(!dev_data->domain))
4633                 return;
4634 @@ -2283,7 +2283,7 @@ static void queue_add(struct dma_ops_domain *dma_dom,
4635         pages     = __roundup_pow_of_two(pages);
4636         address >>= PAGE_SHIFT;
4637  
4638 -       queue = get_cpu_ptr(&flush_queue);
4639 +       queue = raw_cpu_ptr(&flush_queue);
4640         spin_lock_irqsave(&queue->lock, flags);
4641  
4642         if (queue->next == FLUSH_QUEUE_SIZE)
4643 @@ -2300,8 +2300,6 @@ static void queue_add(struct dma_ops_domain *dma_dom,
4644  
4645         if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
4646                 mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
4647 -
4648 -       put_cpu_ptr(&flush_queue);
4649  }
4650  
4651  
4652 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4653 index 002f8a421efa..980f41f1a194 100644
4654 --- a/drivers/iommu/intel-iommu.c
4655 +++ b/drivers/iommu/intel-iommu.c
4656 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4657         struct deferred_flush_table *tables;
4658  };
4659  
4660 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4661 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4662  
4663  /* bitmap for indexing intel_iommus */
4664  static int g_num_of_iommus;
4665 @@ -3719,10 +3719,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4666         struct intel_iommu *iommu;
4667         struct deferred_flush_entry *entry;
4668         struct deferred_flush_data *flush_data;
4669 -       unsigned int cpuid;
4670  
4671 -       cpuid = get_cpu();
4672 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4673 +       flush_data = raw_cpu_ptr(&deferred_flush);
4674  
4675         /* Flush all CPUs' entries to avoid deferring too much.  If
4676          * this becomes a bottleneck, can just flush us, and rely on
4677 @@ -3755,8 +3753,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4678         }
4679         flush_data->size++;
4680         spin_unlock_irqrestore(&flush_data->lock, flags);
4681 -
4682 -       put_cpu();
4683  }
4684  
4685  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4686 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4687 index e23001bfcfee..359d5d169ec0 100644
4688 --- a/drivers/iommu/iova.c
4689 +++ b/drivers/iommu/iova.c
4690 @@ -22,6 +22,7 @@
4691  #include <linux/slab.h>
4692  #include <linux/smp.h>
4693  #include <linux/bitops.h>
4694 +#include <linux/cpu.h>
4695  
4696  static bool iova_rcache_insert(struct iova_domain *iovad,
4697                                unsigned long pfn,
4698 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4699  
4700                 /* Try replenishing IOVAs by flushing rcache. */
4701                 flushed_rcache = true;
4702 -               preempt_disable();
4703                 for_each_online_cpu(cpu)
4704                         free_cpu_cached_iovas(cpu, iovad);
4705 -               preempt_enable();
4706                 goto retry;
4707         }
4708  
4709 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4710         bool can_insert = false;
4711         unsigned long flags;
4712  
4713 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4714 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4715         spin_lock_irqsave(&cpu_rcache->lock, flags);
4716  
4717         if (!iova_magazine_full(cpu_rcache->loaded)) {
4718 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4719                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4720  
4721         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4722 -       put_cpu_ptr(rcache->cpu_rcaches);
4723  
4724         if (mag_to_free) {
4725                 iova_magazine_free_pfns(mag_to_free, iovad);
4726 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4727         bool has_pfn = false;
4728         unsigned long flags;
4729  
4730 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4731 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4732         spin_lock_irqsave(&cpu_rcache->lock, flags);
4733  
4734         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4735 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4736                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4737  
4738         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4739 -       put_cpu_ptr(rcache->cpu_rcaches);
4740  
4741         return iova_pfn;
4742  }
4743 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4744 index 3f9ddb9fafa7..09da5b6b44a1 100644
4745 --- a/drivers/leds/trigger/Kconfig
4746 +++ b/drivers/leds/trigger/Kconfig
4747 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4748  
4749  config LEDS_TRIGGER_CPU
4750         bool "LED CPU Trigger"
4751 -       depends on LEDS_TRIGGERS
4752 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4753         help
4754           This allows LEDs to be controlled by active CPUs. This shows
4755           the active CPUs across an array of LEDs so you can see which
4756 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4757 index 4d200883c505..98b64ed5cb81 100644
4758 --- a/drivers/md/bcache/Kconfig
4759 +++ b/drivers/md/bcache/Kconfig
4760 @@ -1,6 +1,7 @@
4761  
4762  config BCACHE
4763         tristate "Block device as cache"
4764 +       depends on !PREEMPT_RT_FULL
4765         ---help---
4766         Allows a block device to be used as cache for other devices; uses
4767         a btree for indexing and the layout is optimized for SSDs.
4768 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4769 index ba7c4c685db3..834ec328f217 100644
4770 --- a/drivers/md/dm-rq.c
4771 +++ b/drivers/md/dm-rq.c
4772 @@ -842,7 +842,7 @@ static void dm_old_request_fn(struct request_queue *q)
4773                 /* Establish tio->ti before queuing work (map_tio_request) */
4774                 tio->ti = ti;
4775                 kthread_queue_work(&md->kworker, &tio->work);
4776 -               BUG_ON(!irqs_disabled());
4777 +               BUG_ON_NONRT(!irqs_disabled());
4778         }
4779  }
4780  
4781 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4782 index 7aea0221530c..4dde911925dc 100644
4783 --- a/drivers/md/raid5.c
4784 +++ b/drivers/md/raid5.c
4785 @@ -429,7 +429,7 @@ void raid5_release_stripe(struct stripe_head *sh)
4786                 md_wakeup_thread(conf->mddev->thread);
4787         return;
4788  slow_path:
4789 -       local_irq_save(flags);
4790 +       local_irq_save_nort(flags);
4791         /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
4792         if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
4793                 INIT_LIST_HEAD(&list);
4794 @@ -438,7 +438,7 @@ void raid5_release_stripe(struct stripe_head *sh)
4795                 spin_unlock(&conf->device_lock);
4796                 release_inactive_stripe_list(conf, &list, hash);
4797         }
4798 -       local_irq_restore(flags);
4799 +       local_irq_restore_nort(flags);
4800  }
4801  
4802  static inline void remove_hash(struct stripe_head *sh)
4803 @@ -1934,8 +1934,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4804         struct raid5_percpu *percpu;
4805         unsigned long cpu;
4806  
4807 -       cpu = get_cpu();
4808 +       cpu = get_cpu_light();
4809         percpu = per_cpu_ptr(conf->percpu, cpu);
4810 +       spin_lock(&percpu->lock);
4811         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4812                 ops_run_biofill(sh);
4813                 overlap_clear++;
4814 @@ -1991,7 +1992,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4815                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4816                                 wake_up(&sh->raid_conf->wait_for_overlap);
4817                 }
4818 -       put_cpu();
4819 +       spin_unlock(&percpu->lock);
4820 +       put_cpu_light();
4821  }
4822  
4823  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4824 @@ -6407,6 +6409,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
4825                        __func__, cpu);
4826                 return -ENOMEM;
4827         }
4828 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4829         return 0;
4830  }
4831  
4832 @@ -6417,7 +6420,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4833         conf->percpu = alloc_percpu(struct raid5_percpu);
4834         if (!conf->percpu)
4835                 return -ENOMEM;
4836 -
4837         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4838         if (!err) {
4839                 conf->scribble_disks = max(conf->raid_disks,
4840 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4841 index 57ec49f0839e..0739604990b7 100644
4842 --- a/drivers/md/raid5.h
4843 +++ b/drivers/md/raid5.h
4844 @@ -504,6 +504,7 @@ struct r5conf {
4845         int                     recovery_disabled;
4846         /* per cpu variables */
4847         struct raid5_percpu {
4848 +               spinlock_t      lock;           /* Protection for -RT */
4849                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4850                 struct flex_array *scribble;   /* space for constructing buffer
4851                                               * lists and performing address
4852 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4853 index 64971baf11fa..215e91e36198 100644
4854 --- a/drivers/misc/Kconfig
4855 +++ b/drivers/misc/Kconfig
4856 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4857  config ATMEL_TCLIB
4858         bool "Atmel AT32/AT91 Timer/Counter Library"
4859         depends on (AVR32 || ARCH_AT91)
4860 +       default y if PREEMPT_RT_FULL
4861         help
4862           Select this if you want a library to allocate the Timer/Counter
4863           blocks found on many Atmel processors.  This facilitates using
4864 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4865           are combined to make a single 32-bit timer.
4866  
4867           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4868 -         may be used as a clock event device supporting oneshot mode
4869 -         (delays of up to two seconds) based on the 32 KiHz clock.
4870 +         may be used as a clock event device supporting oneshot mode.
4871  
4872  config ATMEL_TCB_CLKSRC_BLOCK
4873         int
4874 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4875           TC can be used for other purposes, such as PWM generation and
4876           interval timing.
4877  
4878 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4879 +       bool "TC Block use 32 KiHz clock"
4880 +       depends on ATMEL_TCB_CLKSRC
4881 +       default y if !PREEMPT_RT_FULL
4882 +       help
4883 +         Select this to use 32 KiHz base clock rate as TC block clock
4884 +         source for clock events.
4885 +
4886 +
4887  config DUMMY_IRQ
4888         tristate "Dummy IRQ handler"
4889         default n
4890 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
4891 index df990bb8c873..1a162709a85e 100644
4892 --- a/drivers/mmc/host/mmci.c
4893 +++ b/drivers/mmc/host/mmci.c
4894 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4895         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4896         struct variant_data *variant = host->variant;
4897         void __iomem *base = host->base;
4898 -       unsigned long flags;
4899         u32 status;
4900  
4901         status = readl(base + MMCISTATUS);
4902  
4903         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4904  
4905 -       local_irq_save(flags);
4906 -
4907         do {
4908                 unsigned int remain, len;
4909                 char *buffer;
4910 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4911  
4912         sg_miter_stop(sg_miter);
4913  
4914 -       local_irq_restore(flags);
4915 -
4916         /*
4917          * If we have less than the fifo 'half-full' threshold to transfer,
4918          * trigger a PIO interrupt as soon as any data is available.
4919 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
4920 index 9133e7926da5..63afb921ed40 100644
4921 --- a/drivers/net/ethernet/3com/3c59x.c
4922 +++ b/drivers/net/ethernet/3com/3c59x.c
4923 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
4924  {
4925         struct vortex_private *vp = netdev_priv(dev);
4926         unsigned long flags;
4927 -       local_irq_save(flags);
4928 +       local_irq_save_nort(flags);
4929         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4930 -       local_irq_restore(flags);
4931 +       local_irq_restore_nort(flags);
4932  }
4933  #endif
4934  
4935 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
4936                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4937                          */
4938                         unsigned long flags;
4939 -                       local_irq_save(flags);
4940 +                       local_irq_save_nort(flags);
4941                         if (vp->full_bus_master_tx)
4942                                 boomerang_interrupt(dev->irq, dev);
4943                         else
4944                                 vortex_interrupt(dev->irq, dev);
4945 -                       local_irq_restore(flags);
4946 +                       local_irq_restore_nort(flags);
4947                 }
4948         }
4949  
4950 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
4951 index da4c2d8a4173..1420dfb56bac 100644
4952 --- a/drivers/net/ethernet/realtek/8139too.c
4953 +++ b/drivers/net/ethernet/realtek/8139too.c
4954 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
4955         struct rtl8139_private *tp = netdev_priv(dev);
4956         const int irq = tp->pci_dev->irq;
4957  
4958 -       disable_irq(irq);
4959 +       disable_irq_nosync(irq);
4960         rtl8139_interrupt(irq, dev);
4961         enable_irq(irq);
4962  }
4963 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4964 index bca6935a94db..d7a35ee34d03 100644
4965 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4966 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4967 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
4968                         while (!ctx->done.done && msecs--)
4969                                 udelay(1000);
4970                 } else {
4971 -                       wait_event_interruptible(ctx->done.wait,
4972 +                       swait_event_interruptible(ctx->done.wait,
4973                                                  ctx->done.done);
4974                 }
4975                 break;
4976 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
4977 index bedce3453dd3..faf038978650 100644
4978 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
4979 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
4980 @@ -61,7 +61,7 @@ struct msm_pinctrl {
4981         struct notifier_block restart_nb;
4982         int irq;
4983  
4984 -       spinlock_t lock;
4985 +       raw_spinlock_t lock;
4986  
4987         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
4988         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
4989 @@ -153,14 +153,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
4990         if (WARN_ON(i == g->nfuncs))
4991                 return -EINVAL;
4992  
4993 -       spin_lock_irqsave(&pctrl->lock, flags);
4994 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4995  
4996         val = readl(pctrl->regs + g->ctl_reg);
4997         val &= ~mask;
4998         val |= i << g->mux_bit;
4999         writel(val, pctrl->regs + g->ctl_reg);
5000  
5001 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5002 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5003  
5004         return 0;
5005  }
5006 @@ -323,14 +323,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
5007                         break;
5008                 case PIN_CONFIG_OUTPUT:
5009                         /* set output value */
5010 -                       spin_lock_irqsave(&pctrl->lock, flags);
5011 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
5012                         val = readl(pctrl->regs + g->io_reg);
5013                         if (arg)
5014                                 val |= BIT(g->out_bit);
5015                         else
5016                                 val &= ~BIT(g->out_bit);
5017                         writel(val, pctrl->regs + g->io_reg);
5018 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
5019 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5020  
5021                         /* enable output */
5022                         arg = 1;
5023 @@ -351,12 +351,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
5024                         return -EINVAL;
5025                 }
5026  
5027 -               spin_lock_irqsave(&pctrl->lock, flags);
5028 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
5029                 val = readl(pctrl->regs + g->ctl_reg);
5030                 val &= ~(mask << bit);
5031                 val |= arg << bit;
5032                 writel(val, pctrl->regs + g->ctl_reg);
5033 -               spin_unlock_irqrestore(&pctrl->lock, flags);
5034 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5035         }
5036  
5037         return 0;
5038 @@ -384,13 +384,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
5039  
5040         g = &pctrl->soc->groups[offset];
5041  
5042 -       spin_lock_irqsave(&pctrl->lock, flags);
5043 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5044  
5045         val = readl(pctrl->regs + g->ctl_reg);
5046         val &= ~BIT(g->oe_bit);
5047         writel(val, pctrl->regs + g->ctl_reg);
5048  
5049 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5050 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5051  
5052         return 0;
5053  }
5054 @@ -404,7 +404,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
5055  
5056         g = &pctrl->soc->groups[offset];
5057  
5058 -       spin_lock_irqsave(&pctrl->lock, flags);
5059 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5060  
5061         val = readl(pctrl->regs + g->io_reg);
5062         if (value)
5063 @@ -417,7 +417,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
5064         val |= BIT(g->oe_bit);
5065         writel(val, pctrl->regs + g->ctl_reg);
5066  
5067 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5068 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5069  
5070         return 0;
5071  }
5072 @@ -443,7 +443,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
5073  
5074         g = &pctrl->soc->groups[offset];
5075  
5076 -       spin_lock_irqsave(&pctrl->lock, flags);
5077 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5078  
5079         val = readl(pctrl->regs + g->io_reg);
5080         if (value)
5081 @@ -452,7 +452,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
5082                 val &= ~BIT(g->out_bit);
5083         writel(val, pctrl->regs + g->io_reg);
5084  
5085 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5086 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5087  }
5088  
5089  #ifdef CONFIG_DEBUG_FS
5090 @@ -571,7 +571,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
5091  
5092         g = &pctrl->soc->groups[d->hwirq];
5093  
5094 -       spin_lock_irqsave(&pctrl->lock, flags);
5095 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5096  
5097         val = readl(pctrl->regs + g->intr_cfg_reg);
5098         val &= ~BIT(g->intr_enable_bit);
5099 @@ -579,7 +579,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
5100  
5101         clear_bit(d->hwirq, pctrl->enabled_irqs);
5102  
5103 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5104 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5105  }
5106  
5107  static void msm_gpio_irq_unmask(struct irq_data *d)
5108 @@ -592,7 +592,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
5109  
5110         g = &pctrl->soc->groups[d->hwirq];
5111  
5112 -       spin_lock_irqsave(&pctrl->lock, flags);
5113 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5114  
5115         val = readl(pctrl->regs + g->intr_cfg_reg);
5116         val |= BIT(g->intr_enable_bit);
5117 @@ -600,7 +600,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
5118  
5119         set_bit(d->hwirq, pctrl->enabled_irqs);
5120  
5121 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5122 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5123  }
5124  
5125  static void msm_gpio_irq_ack(struct irq_data *d)
5126 @@ -613,7 +613,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
5127  
5128         g = &pctrl->soc->groups[d->hwirq];
5129  
5130 -       spin_lock_irqsave(&pctrl->lock, flags);
5131 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5132  
5133         val = readl(pctrl->regs + g->intr_status_reg);
5134         if (g->intr_ack_high)
5135 @@ -625,7 +625,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
5136         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
5137                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
5138  
5139 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5140 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5141  }
5142  
5143  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5144 @@ -638,7 +638,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5145  
5146         g = &pctrl->soc->groups[d->hwirq];
5147  
5148 -       spin_lock_irqsave(&pctrl->lock, flags);
5149 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5150  
5151         /*
5152          * For hw without possibility of detecting both edges
5153 @@ -712,7 +712,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5154         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
5155                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
5156  
5157 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5158 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5159  
5160         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
5161                 irq_set_handler_locked(d, handle_level_irq);
5162 @@ -728,11 +728,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
5163         struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
5164         unsigned long flags;
5165  
5166 -       spin_lock_irqsave(&pctrl->lock, flags);
5167 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5168  
5169         irq_set_irq_wake(pctrl->irq, on);
5170  
5171 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5172 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5173  
5174         return 0;
5175  }
5176 @@ -878,7 +878,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
5177         pctrl->soc = soc_data;
5178         pctrl->chip = msm_gpio_template;
5179  
5180 -       spin_lock_init(&pctrl->lock);
5181 +       raw_spin_lock_init(&pctrl->lock);
5182  
5183         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
5184         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
5185 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
5186 index 9bd41a35a78a..8e2d436c2e3f 100644
5187 --- a/drivers/scsi/fcoe/fcoe.c
5188 +++ b/drivers/scsi/fcoe/fcoe.c
5189 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
5190  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
5191  {
5192         struct fcoe_percpu_s *fps;
5193 -       int rc;
5194 +       int rc, cpu = get_cpu_light();
5195  
5196 -       fps = &get_cpu_var(fcoe_percpu);
5197 +       fps = &per_cpu(fcoe_percpu, cpu);
5198         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
5199 -       put_cpu_var(fcoe_percpu);
5200 +       put_cpu_light();
5201  
5202         return rc;
5203  }
5204 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
5205                 return 0;
5206         }
5207  
5208 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5209 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5210         stats->InvalidCRCCount++;
5211         if (stats->InvalidCRCCount < 5)
5212                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
5213 -       put_cpu();
5214 +       put_cpu_light();
5215         return -EINVAL;
5216  }
5217  
5218 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5219          */
5220         hp = (struct fcoe_hdr *) skb_network_header(skb);
5221  
5222 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5223 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5224         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
5225                 if (stats->ErrorFrames < 5)
5226                         printk(KERN_WARNING "fcoe: FCoE version "
5227 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5228                 goto drop;
5229  
5230         if (!fcoe_filter_frames(lport, fp)) {
5231 -               put_cpu();
5232 +               put_cpu_light();
5233                 fc_exch_recv(lport, fp);
5234                 return;
5235         }
5236  drop:
5237         stats->ErrorFrames++;
5238 -       put_cpu();
5239 +       put_cpu_light();
5240         kfree_skb(skb);
5241  }
5242  
5243 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
5244 index dcf36537a767..1a1f2e46452c 100644
5245 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
5246 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
5247 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5248  
5249         INIT_LIST_HEAD(&del_list);
5250  
5251 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
5252 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
5253  
5254         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
5255                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
5256 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5257                                 sel_time = fcf->time;
5258                 }
5259         }
5260 -       put_cpu();
5261 +       put_cpu_light();
5262  
5263         list_for_each_entry_safe(fcf, next, &del_list, list) {
5264                 /* Removes fcf from current list */
5265 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
5266 index 16ca31ad5ec0..c3987347e762 100644
5267 --- a/drivers/scsi/libfc/fc_exch.c
5268 +++ b/drivers/scsi/libfc/fc_exch.c
5269 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
5270         }
5271         memset(ep, 0, sizeof(*ep));
5272  
5273 -       cpu = get_cpu();
5274 +       cpu = get_cpu_light();
5275         pool = per_cpu_ptr(mp->pool, cpu);
5276         spin_lock_bh(&pool->lock);
5277 -       put_cpu();
5278 +       put_cpu_light();
5279  
5280         /* peek cache of free slot */
5281         if (pool->left != FC_XID_UNKNOWN) {
5282 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
5283 index 87f5e694dbed..23c0a50fb6aa 100644
5284 --- a/drivers/scsi/libsas/sas_ata.c
5285 +++ b/drivers/scsi/libsas/sas_ata.c
5286 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5287         /* TODO: audit callers to ensure they are ready for qc_issue to
5288          * unconditionally re-enable interrupts
5289          */
5290 -       local_irq_save(flags);
5291 +       local_irq_save_nort(flags);
5292         spin_unlock(ap->lock);
5293  
5294         /* If the device fell off, no sense in issuing commands */
5295 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5296  
5297   out:
5298         spin_lock(ap->lock);
5299 -       local_irq_restore(flags);
5300 +       local_irq_restore_nort(flags);
5301         return ret;
5302  }
5303  
5304 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
5305 index edc48f3b8230..ee5c6f9dfb6f 100644
5306 --- a/drivers/scsi/qla2xxx/qla_inline.h
5307 +++ b/drivers/scsi/qla2xxx/qla_inline.h
5308 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
5309  {
5310         unsigned long flags;
5311         struct qla_hw_data *ha = rsp->hw;
5312 -       local_irq_save(flags);
5313 +       local_irq_save_nort(flags);
5314         if (IS_P3P_TYPE(ha))
5315                 qla82xx_poll(0, rsp);
5316         else
5317                 ha->isp_ops->intr_handler(0, rsp);
5318 -       local_irq_restore(flags);
5319 +       local_irq_restore_nort(flags);
5320  }
5321  
5322  static inline uint8_t *
5323 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
5324 index bddaabb288d4..8de0ec4222fe 100644
5325 --- a/drivers/scsi/qla2xxx/qla_isr.c
5326 +++ b/drivers/scsi/qla2xxx/qla_isr.c
5327 @@ -3129,7 +3129,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
5328                 * kref_put().
5329                 */
5330                 kref_get(&qentry->irq_notify.kref);
5331 +#ifdef CONFIG_PREEMPT_RT_BASE
5332 +               swork_queue(&qentry->irq_notify.swork);
5333 +#else
5334                 schedule_work(&qentry->irq_notify.work);
5335 +#endif
5336         }
5337  
5338         /*
5339 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
5340 index 95f4c1bcdb4c..0be934799bff 100644
5341 --- a/drivers/thermal/x86_pkg_temp_thermal.c
5342 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
5343 @@ -29,6 +29,7 @@
5344  #include <linux/pm.h>
5345  #include <linux/thermal.h>
5346  #include <linux/debugfs.h>
5347 +#include <linux/swork.h>
5348  #include <asm/cpu_device_id.h>
5349  #include <asm/mce.h>
5350  
5351 @@ -353,7 +354,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
5352         }
5353  }
5354  
5355 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5356 +static void platform_thermal_notify_work(struct swork_event *event)
5357  {
5358         unsigned long flags;
5359         int cpu = smp_processor_id();
5360 @@ -370,7 +371,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5361                         pkg_work_scheduled[phy_id]) {
5362                 disable_pkg_thres_interrupt();
5363                 spin_unlock_irqrestore(&pkg_work_lock, flags);
5364 -               return -EINVAL;
5365 +               return;
5366         }
5367         pkg_work_scheduled[phy_id] = 1;
5368         spin_unlock_irqrestore(&pkg_work_lock, flags);
5369 @@ -379,9 +380,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5370         schedule_delayed_work_on(cpu,
5371                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
5372                                 msecs_to_jiffies(notify_delay_ms));
5373 +}
5374 +
5375 +#ifdef CONFIG_PREEMPT_RT_FULL
5376 +static struct swork_event notify_work;
5377 +
5378 +static int thermal_notify_work_init(void)
5379 +{
5380 +       int err;
5381 +
5382 +       err = swork_get();
5383 +       if (err)
5384 +               return err;
5385 +
5386 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
5387         return 0;
5388  }
5389  
5390 +static void thermal_notify_work_cleanup(void)
5391 +{
5392 +       swork_put();
5393 +}
5394 +
5395 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5396 +{
5397 +       swork_queue(&notify_work);
5398 +       return 0;
5399 +}
5400 +
5401 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5402 +
5403 +static int thermal_notify_work_init(void) { return 0; }
5404 +
5405 +static void thermal_notify_work_cleanup(void) {  }
5406 +
5407 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5408 +{
5409 +       platform_thermal_notify_work(NULL);
5410 +
5411 +       return 0;
5412 +}
5413 +#endif /* CONFIG_PREEMPT_RT_FULL */
5414 +
5415  static int find_siblings_cpu(int cpu)
5416  {
5417         int i;
5418 @@ -585,6 +625,9 @@ static int __init pkg_temp_thermal_init(void)
5419         if (!x86_match_cpu(pkg_temp_thermal_ids))
5420                 return -ENODEV;
5421  
5422 +       if (thermal_notify_work_init())
5423 +               return -ENODEV;
5424 +
5425         spin_lock_init(&pkg_work_lock);
5426         platform_thermal_package_notify =
5427                         pkg_temp_thermal_platform_thermal_notify;
5428 @@ -609,7 +652,7 @@ static int __init pkg_temp_thermal_init(void)
5429         kfree(pkg_work_scheduled);
5430         platform_thermal_package_notify = NULL;
5431         platform_thermal_package_rate_control = NULL;
5432 -
5433 +       thermal_notify_work_cleanup();
5434         return -ENODEV;
5435  }
5436  
5437 @@ -634,6 +677,7 @@ static void __exit pkg_temp_thermal_exit(void)
5438         mutex_unlock(&phy_dev_list_mutex);
5439         platform_thermal_package_notify = NULL;
5440         platform_thermal_package_rate_control = NULL;
5441 +       thermal_notify_work_cleanup();
5442         for_each_online_cpu(i)
5443                 cancel_delayed_work_sync(
5444                         &per_cpu(pkg_temp_thermal_threshold_work, i));
5445 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
5446 index e8819aa20415..dd7f9bf45d6c 100644
5447 --- a/drivers/tty/serial/8250/8250_core.c
5448 +++ b/drivers/tty/serial/8250/8250_core.c
5449 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
5450  
5451  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5452  
5453 -#define PASS_LIMIT     512
5454 +/*
4455 + * On -rt we can have more delays, and legitimately
5456 + * so - so don't drop work spuriously and spam the
5457 + * syslog:
5458 + */
5459 +#ifdef CONFIG_PREEMPT_RT_FULL
5460 +# define PASS_LIMIT    1000000
5461 +#else
5462 +# define PASS_LIMIT    512
5463 +#endif
5464  
5465  #include <asm/serial.h>
5466  /*
5467 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
5468 index f6e4373a8850..4620b51b0e7c 100644
5469 --- a/drivers/tty/serial/8250/8250_port.c
5470 +++ b/drivers/tty/serial/8250/8250_port.c
5471 @@ -35,6 +35,7 @@
5472  #include <linux/nmi.h>
5473  #include <linux/mutex.h>
5474  #include <linux/slab.h>
5475 +#include <linux/kdb.h>
5476  #include <linux/uaccess.h>
5477  #include <linux/pm_runtime.h>
5478  #include <linux/timer.h>
5479 @@ -3143,9 +3144,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
5480  
5481         serial8250_rpm_get(up);
5482  
5483 -       if (port->sysrq)
5484 +       if (port->sysrq || oops_in_progress)
5485                 locked = 0;
5486 -       else if (oops_in_progress)
5487 +       else if (in_kdb_printk())
5488                 locked = spin_trylock_irqsave(&port->lock, flags);
5489         else
5490                 spin_lock_irqsave(&port->lock, flags);
5491 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
5492 index e2c33b9528d8..53af53c43e8c 100644
5493 --- a/drivers/tty/serial/amba-pl011.c
5494 +++ b/drivers/tty/serial/amba-pl011.c
5495 @@ -2194,13 +2194,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5496  
5497         clk_enable(uap->clk);
5498  
5499 -       local_irq_save(flags);
5500 +       /*
5501 +        * local_irq_save(flags);
5502 +        *
5503 +        * This local_irq_save() is nonsense. If we come in via sysrq
5504 +        * handling then interrupts are already disabled. Aside from
5505 +        * that, the port.sysrq check is racy on SMP regardless.
5506 +        */
5507         if (uap->port.sysrq)
5508                 locked = 0;
5509         else if (oops_in_progress)
5510 -               locked = spin_trylock(&uap->port.lock);
5511 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5512         else
5513 -               spin_lock(&uap->port.lock);
5514 +               spin_lock_irqsave(&uap->port.lock, flags);
5515  
5516         /*
5517          *      First save the CR then disable the interrupts
5518 @@ -2224,8 +2230,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5519                 pl011_write(old_cr, uap, REG_CR);
5520  
5521         if (locked)
5522 -               spin_unlock(&uap->port.lock);
5523 -       local_irq_restore(flags);
5524 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5525  
5526         clk_disable(uap->clk);
5527  }
5528 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
5529 index 472ba3c813c1..e654cb421fb7 100644
5530 --- a/drivers/tty/serial/omap-serial.c
5531 +++ b/drivers/tty/serial/omap-serial.c
5532 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
5533  
5534         pm_runtime_get_sync(up->dev);
5535  
5536 -       local_irq_save(flags);
5537 -       if (up->port.sysrq)
5538 -               locked = 0;
5539 -       else if (oops_in_progress)
5540 -               locked = spin_trylock(&up->port.lock);
5541 +       if (up->port.sysrq || oops_in_progress)
5542 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5543         else
5544 -               spin_lock(&up->port.lock);
5545 +               spin_lock_irqsave(&up->port.lock, flags);
5546  
5547         /*
5548          * First save the IER then disable the interrupts
5549 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
5550         pm_runtime_mark_last_busy(up->dev);
5551         pm_runtime_put_autosuspend(up->dev);
5552         if (locked)
5553 -               spin_unlock(&up->port.lock);
5554 -       local_irq_restore(flags);
5555 +               spin_unlock_irqrestore(&up->port.lock, flags);
5556  }
5557  
5558  static int __init
5559 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
5560 index fcc7aa248ce7..fb2c38d875f9 100644
5561 --- a/drivers/usb/core/hcd.c
5562 +++ b/drivers/usb/core/hcd.c
5563 @@ -1764,9 +1764,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
5564          * and no one may trigger the above deadlock situation when
5565          * running complete() in tasklet.
5566          */
5567 -       local_irq_save(flags);
5568 +       local_irq_save_nort(flags);
5569         urb->complete(urb);
5570 -       local_irq_restore(flags);
5571 +       local_irq_restore_nort(flags);
5572  
5573         usb_anchor_resume_wakeups(anchor);
5574         atomic_dec(&urb->use_count);
5575 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
5576 index 4fce83266926..5e902a89d8e6 100644
5577 --- a/drivers/usb/gadget/function/f_fs.c
5578 +++ b/drivers/usb/gadget/function/f_fs.c
5579 @@ -1593,7 +1593,7 @@ static void ffs_data_put(struct ffs_data *ffs)
5580                 pr_info("%s(): freeing\n", __func__);
5581                 ffs_data_clear(ffs);
5582                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5583 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5584 +                      swait_active(&ffs->ep0req_completion.wait));
5585                 kfree(ffs->dev_name);
5586                 kfree(ffs);
5587         }
5588 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
5589 index f69dbd4bcd18..3b7638322f9f 100644
5590 --- a/drivers/usb/gadget/legacy/inode.c
5591 +++ b/drivers/usb/gadget/legacy/inode.c
5592 @@ -347,7 +347,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5593         spin_unlock_irq (&epdata->dev->lock);
5594  
5595         if (likely (value == 0)) {
5596 -               value = wait_event_interruptible (done.wait, done.done);
5597 +               value = swait_event_interruptible (done.wait, done.done);
5598                 if (value != 0) {
5599                         spin_lock_irq (&epdata->dev->lock);
5600                         if (likely (epdata->ep != NULL)) {
5601 @@ -356,7 +356,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5602                                 usb_ep_dequeue (epdata->ep, epdata->req);
5603                                 spin_unlock_irq (&epdata->dev->lock);
5604  
5605 -                               wait_event (done.wait, done.done);
5606 +                               swait_event (done.wait, done.done);
5607                                 if (epdata->status == -ECONNRESET)
5608                                         epdata->status = -EINTR;
5609                         } else {
5610 diff --git a/fs/aio.c b/fs/aio.c
5611 index 0fcb49ad67d4..211ebc21e4db 100644
5612 --- a/fs/aio.c
5613 +++ b/fs/aio.c
5614 @@ -40,6 +40,7 @@
5615  #include <linux/ramfs.h>
5616  #include <linux/percpu-refcount.h>
5617  #include <linux/mount.h>
5618 +#include <linux/swork.h>
5619  
5620  #include <asm/kmap_types.h>
5621  #include <asm/uaccess.h>
5622 @@ -115,7 +116,7 @@ struct kioctx {
5623         struct page             **ring_pages;
5624         long                    nr_pages;
5625  
5626 -       struct work_struct      free_work;
5627 +       struct swork_event      free_work;
5628  
5629         /*
5630          * signals when all in-flight requests are done
5631 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
5632                 .mount          = aio_mount,
5633                 .kill_sb        = kill_anon_super,
5634         };
5635 +       BUG_ON(swork_get());
5636         aio_mnt = kern_mount(&aio_fs);
5637         if (IS_ERR(aio_mnt))
5638                 panic("Failed to create aio fs mount.");
5639 @@ -581,9 +583,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
5640         return cancel(&kiocb->common);
5641  }
5642  
5643 -static void free_ioctx(struct work_struct *work)
5644 +static void free_ioctx(struct swork_event *sev)
5645  {
5646 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5647 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5648  
5649         pr_debug("freeing %p\n", ctx);
5650  
5651 @@ -602,8 +604,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5652         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5653                 complete(&ctx->rq_wait->comp);
5654  
5655 -       INIT_WORK(&ctx->free_work, free_ioctx);
5656 -       schedule_work(&ctx->free_work);
5657 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5658 +       swork_queue(&ctx->free_work);
5659  }
5660  
5661  /*
5662 @@ -611,9 +613,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5663   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5664   * now it's safe to cancel any that need to be.
5665   */
5666 -static void free_ioctx_users(struct percpu_ref *ref)
5667 +static void free_ioctx_users_work(struct swork_event *sev)
5668  {
5669 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5670 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5671         struct aio_kiocb *req;
5672  
5673         spin_lock_irq(&ctx->ctx_lock);
5674 @@ -632,6 +634,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
5675         percpu_ref_put(&ctx->reqs);
5676  }
5677  
5678 +static void free_ioctx_users(struct percpu_ref *ref)
5679 +{
5680 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5681 +
5682 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5683 +       swork_queue(&ctx->free_work);
5684 +}
5685 +
5686  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5687  {
5688         unsigned i, new_nr;
5689 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
5690 index a1fba4285277..3796769b4cd1 100644
5691 --- a/fs/autofs4/autofs_i.h
5692 +++ b/fs/autofs4/autofs_i.h
5693 @@ -31,6 +31,7 @@
5694  #include <linux/sched.h>
5695  #include <linux/mount.h>
5696  #include <linux/namei.h>
5697 +#include <linux/delay.h>
5698  #include <asm/current.h>
5699  #include <linux/uaccess.h>
5700  
5701 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
5702 index d8e6d421c27f..2e689ab1306b 100644
5703 --- a/fs/autofs4/expire.c
5704 +++ b/fs/autofs4/expire.c
5705 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
5706                         parent = p->d_parent;
5707                         if (!spin_trylock(&parent->d_lock)) {
5708                                 spin_unlock(&p->d_lock);
5709 -                               cpu_relax();
5710 +                               cpu_chill();
5711                                 goto relock;
5712                         }
5713                         spin_unlock(&p->d_lock);
5714 diff --git a/fs/buffer.c b/fs/buffer.c
5715 index 5d8f496d624e..48074bd91ea3 100644
5716 --- a/fs/buffer.c
5717 +++ b/fs/buffer.c
5718 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5719          * decide that the page is now completely done.
5720          */
5721         first = page_buffers(page);
5722 -       local_irq_save(flags);
5723 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5724 +       flags = bh_uptodate_lock_irqsave(first);
5725         clear_buffer_async_read(bh);
5726         unlock_buffer(bh);
5727         tmp = bh;
5728 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5729                 }
5730                 tmp = tmp->b_this_page;
5731         } while (tmp != bh);
5732 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5733 -       local_irq_restore(flags);
5734 +       bh_uptodate_unlock_irqrestore(first, flags);
5735  
5736         /*
5737          * If none of the buffers had errors and they are all
5738 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5739         return;
5740  
5741  still_busy:
5742 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5743 -       local_irq_restore(flags);
5744 -       return;
5745 +       bh_uptodate_unlock_irqrestore(first, flags);
5746  }
5747  
5748  /*
5749 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5750         }
5751  
5752         first = page_buffers(page);
5753 -       local_irq_save(flags);
5754 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5755 +       flags = bh_uptodate_lock_irqsave(first);
5756  
5757         clear_buffer_async_write(bh);
5758         unlock_buffer(bh);
5759 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5760                 }
5761                 tmp = tmp->b_this_page;
5762         }
5763 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5764 -       local_irq_restore(flags);
5765 +       bh_uptodate_unlock_irqrestore(first, flags);
5766         end_page_writeback(page);
5767         return;
5768  
5769  still_busy:
5770 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5771 -       local_irq_restore(flags);
5772 -       return;
5773 +       bh_uptodate_unlock_irqrestore(first, flags);
5774  }
5775  EXPORT_SYMBOL(end_buffer_async_write);
5776  
5777 @@ -3383,6 +3375,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
5778         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5779         if (ret) {
5780                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5781 +               buffer_head_init_locks(ret);
5782                 preempt_disable();
5783                 __this_cpu_inc(bh_accounting.nr);
5784                 recalc_bh_state();
5785 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
5786 index a27fc8791551..791aecb7c1ac 100644
5787 --- a/fs/cifs/readdir.c
5788 +++ b/fs/cifs/readdir.c
5789 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
5790         struct inode *inode;
5791         struct super_block *sb = parent->d_sb;
5792         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5793 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5794 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5795  
5796         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5797  
5798 diff --git a/fs/dcache.c b/fs/dcache.c
5799 index 67957f5b325c..f0719b2f1be5 100644
5800 --- a/fs/dcache.c
5801 +++ b/fs/dcache.c
5802 @@ -19,6 +19,7 @@
5803  #include <linux/mm.h>
5804  #include <linux/fs.h>
5805  #include <linux/fsnotify.h>
5806 +#include <linux/delay.h>
5807  #include <linux/slab.h>
5808  #include <linux/init.h>
5809  #include <linux/hash.h>
5810 @@ -777,6 +778,8 @@ static inline bool fast_dput(struct dentry *dentry)
5811   */
5812  void dput(struct dentry *dentry)
5813  {
5814 +       struct dentry *parent;
5815 +
5816         if (unlikely(!dentry))
5817                 return;
5818  
5819 @@ -815,9 +818,18 @@ void dput(struct dentry *dentry)
5820         return;
5821  
5822  kill_it:
5823 -       dentry = dentry_kill(dentry);
5824 -       if (dentry) {
5825 -               cond_resched();
5826 +       parent = dentry_kill(dentry);
5827 +       if (parent) {
5828 +               int r;
5829 +
5830 +               if (parent == dentry) {
5831 +                       /* the task with the highest priority won't schedule */
5832 +                       r = cond_resched();
5833 +                       if (!r)
5834 +                               cpu_chill();
5835 +               } else {
5836 +                       dentry = parent;
5837 +               }
5838                 goto repeat;
5839         }
5840  }
5841 @@ -2352,7 +2364,7 @@ void d_delete(struct dentry * dentry)
5842         if (dentry->d_lockref.count == 1) {
5843                 if (!spin_trylock(&inode->i_lock)) {
5844                         spin_unlock(&dentry->d_lock);
5845 -                       cpu_relax();
5846 +                       cpu_chill();
5847                         goto again;
5848                 }
5849                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5850 @@ -2397,9 +2409,10 @@ EXPORT_SYMBOL(d_rehash);
5851  static inline unsigned start_dir_add(struct inode *dir)
5852  {
5853  
5854 +       preempt_disable_rt();
5855         for (;;) {
5856 -               unsigned n = dir->i_dir_seq;
5857 -               if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
5858 +               unsigned n = dir->__i_dir_seq;
5859 +               if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
5860                         return n;
5861                 cpu_relax();
5862         }
5863 @@ -2407,26 +2420,30 @@ static inline unsigned start_dir_add(struct inode *dir)
5864  
5865  static inline void end_dir_add(struct inode *dir, unsigned n)
5866  {
5867 -       smp_store_release(&dir->i_dir_seq, n + 2);
5868 +       smp_store_release(&dir->__i_dir_seq, n + 2);
5869 +       preempt_enable_rt();
5870  }
5871  
5872  static void d_wait_lookup(struct dentry *dentry)
5873  {
5874 -       if (d_in_lookup(dentry)) {
5875 -               DECLARE_WAITQUEUE(wait, current);
5876 -               add_wait_queue(dentry->d_wait, &wait);
5877 -               do {
5878 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5879 -                       spin_unlock(&dentry->d_lock);
5880 -                       schedule();
5881 -                       spin_lock(&dentry->d_lock);
5882 -               } while (d_in_lookup(dentry));
5883 -       }
5884 +       struct swait_queue __wait;
5885 +
5886 +       if (!d_in_lookup(dentry))
5887 +               return;
5888 +
5889 +       INIT_LIST_HEAD(&__wait.task_list);
5890 +       do {
5891 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5892 +               spin_unlock(&dentry->d_lock);
5893 +               schedule();
5894 +               spin_lock(&dentry->d_lock);
5895 +       } while (d_in_lookup(dentry));
5896 +       finish_swait(dentry->d_wait, &__wait);
5897  }
5898  
5899  struct dentry *d_alloc_parallel(struct dentry *parent,
5900                                 const struct qstr *name,
5901 -                               wait_queue_head_t *wq)
5902 +                               struct swait_queue_head *wq)
5903  {
5904         unsigned int hash = name->hash;
5905         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5906 @@ -2440,7 +2457,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
5907  
5908  retry:
5909         rcu_read_lock();
5910 -       seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1;
5911 +       seq = smp_load_acquire(&parent->d_inode->__i_dir_seq) & ~1;
5912         r_seq = read_seqbegin(&rename_lock);
5913         dentry = __d_lookup_rcu(parent, name, &d_seq);
5914         if (unlikely(dentry)) {
5915 @@ -2462,7 +2479,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
5916                 goto retry;
5917         }
5918         hlist_bl_lock(b);
5919 -       if (unlikely(parent->d_inode->i_dir_seq != seq)) {
5920 +       if (unlikely(parent->d_inode->__i_dir_seq != seq)) {
5921                 hlist_bl_unlock(b);
5922                 rcu_read_unlock();
5923                 goto retry;
5924 @@ -2535,7 +2552,7 @@ void __d_lookup_done(struct dentry *dentry)
5925         hlist_bl_lock(b);
5926         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5927         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5928 -       wake_up_all(dentry->d_wait);
5929 +       swake_up_all(dentry->d_wait);
5930         dentry->d_wait = NULL;
5931         hlist_bl_unlock(b);
5932         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5933 @@ -3632,6 +3649,11 @@ EXPORT_SYMBOL(d_genocide);
5934  
5935  void __init vfs_caches_init_early(void)
5936  {
5937 +       int i;
5938 +
5939 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5940 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5941 +
5942         dcache_init_early();
5943         inode_init_early();
5944  }
5945 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
5946 index 3cbc30413add..41a94f552aab 100644
5947 --- a/fs/eventpoll.c
5948 +++ b/fs/eventpoll.c
5949 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
5950   */
5951  static void ep_poll_safewake(wait_queue_head_t *wq)
5952  {
5953 -       int this_cpu = get_cpu();
5954 +       int this_cpu = get_cpu_light();
5955  
5956         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5957                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5958  
5959 -       put_cpu();
5960 +       put_cpu_light();
5961  }
5962  
5963  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5964 diff --git a/fs/exec.c b/fs/exec.c
5965 index b8c43be24751..71f4c6ec2bb8 100644
5966 --- a/fs/exec.c
5967 +++ b/fs/exec.c
5968 @@ -1038,12 +1038,14 @@ static int exec_mmap(struct mm_struct *mm)
5969                 }
5970         }
5971         task_lock(tsk);
5972 +       preempt_disable_rt();
5973         active_mm = tsk->active_mm;
5974         tsk->mm = mm;
5975         tsk->active_mm = mm;
5976         activate_mm(active_mm, mm);
5977         tsk->mm->vmacache_seqnum = 0;
5978         vmacache_flush(tsk);
5979 +       preempt_enable_rt();
5980         task_unlock(tsk);
5981         if (old_mm) {
5982                 up_read(&old_mm->mmap_sem);
5983 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
5984 index 0094923e5ebf..37fa06ef5417 100644
5985 --- a/fs/ext4/page-io.c
5986 +++ b/fs/ext4/page-io.c
5987 @@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *bio)
5988                  * We check all buffers in the page under BH_Uptodate_Lock
5989                  * to avoid races with other end io clearing async_write flags
5990                  */
5991 -               local_irq_save(flags);
5992 -               bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
5993 +               flags = bh_uptodate_lock_irqsave(head);
5994                 do {
5995                         if (bh_offset(bh) < bio_start ||
5996                             bh_offset(bh) + bh->b_size > bio_end) {
5997 @@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *bio)
5998                         if (bio->bi_error)
5999                                 buffer_io_error(bh);
6000                 } while ((bh = bh->b_this_page) != head);
6001 -               bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
6002 -               local_irq_restore(flags);
6003 +               bh_uptodate_unlock_irqrestore(head, flags);
6004                 if (!under_io) {
6005  #ifdef CONFIG_EXT4_FS_ENCRYPTION
6006                         if (data_page)
6007 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
6008 index 4bbad745415a..5f91ca248ab0 100644
6009 --- a/fs/fuse/dir.c
6010 +++ b/fs/fuse/dir.c
6011 @@ -1191,7 +1191,7 @@ static int fuse_direntplus_link(struct file *file,
6012         struct inode *dir = d_inode(parent);
6013         struct fuse_conn *fc;
6014         struct inode *inode;
6015 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6016 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6017  
6018         if (!o->nodeid) {
6019                 /*
6020 diff --git a/fs/inode.c b/fs/inode.c
6021 index 920aa0b1c6b0..3d6b5fd1bf06 100644
6022 --- a/fs/inode.c
6023 +++ b/fs/inode.c
6024 @@ -153,7 +153,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
6025         inode->i_bdev = NULL;
6026         inode->i_cdev = NULL;
6027         inode->i_link = NULL;
6028 -       inode->i_dir_seq = 0;
6029 +       inode->__i_dir_seq = 0;
6030         inode->i_rdev = 0;
6031         inode->dirtied_when = 0;
6032  
6033 diff --git a/fs/libfs.c b/fs/libfs.c
6034 index 9588780ad43e..9b37abd354c9 100644
6035 --- a/fs/libfs.c
6036 +++ b/fs/libfs.c
6037 @@ -89,7 +89,7 @@ static struct dentry *next_positive(struct dentry *parent,
6038                                     struct list_head *from,
6039                                     int count)
6040  {
6041 -       unsigned *seq = &parent->d_inode->i_dir_seq, n;
6042 +       unsigned *seq = &parent->d_inode->__i_dir_seq, n;
6043         struct dentry *res;
6044         struct list_head *p;
6045         bool skipped;
6046 @@ -122,8 +122,9 @@ static struct dentry *next_positive(struct dentry *parent,
6047  static void move_cursor(struct dentry *cursor, struct list_head *after)
6048  {
6049         struct dentry *parent = cursor->d_parent;
6050 -       unsigned n, *seq = &parent->d_inode->i_dir_seq;
6051 +       unsigned n, *seq = &parent->d_inode->__i_dir_seq;
6052         spin_lock(&parent->d_lock);
6053 +       preempt_disable_rt();
6054         for (;;) {
6055                 n = *seq;
6056                 if (!(n & 1) && cmpxchg(seq, n, n + 1) == n)
6057 @@ -136,6 +137,7 @@ static void move_cursor(struct dentry *cursor, struct list_head *after)
6058         else
6059                 list_add_tail(&cursor->d_child, &parent->d_subdirs);
6060         smp_store_release(seq, n + 2);
6061 +       preempt_enable_rt();
6062         spin_unlock(&parent->d_lock);
6063  }
6064  
6065 diff --git a/fs/locks.c b/fs/locks.c
6066 index 22c5b4aa4961..269c6a44449a 100644
6067 --- a/fs/locks.c
6068 +++ b/fs/locks.c
6069 @@ -935,7 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
6070                         return -ENOMEM;
6071         }
6072  
6073 -       percpu_down_read_preempt_disable(&file_rwsem);
6074 +       percpu_down_read(&file_rwsem);
6075         spin_lock(&ctx->flc_lock);
6076         if (request->fl_flags & FL_ACCESS)
6077                 goto find_conflict;
6078 @@ -976,7 +976,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
6079  
6080  out:
6081         spin_unlock(&ctx->flc_lock);
6082 -       percpu_up_read_preempt_enable(&file_rwsem);
6083 +       percpu_up_read(&file_rwsem);
6084         if (new_fl)
6085                 locks_free_lock(new_fl);
6086         locks_dispose_list(&dispose);
6087 @@ -1013,7 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
6088                 new_fl2 = locks_alloc_lock();
6089         }
6090  
6091 -       percpu_down_read_preempt_disable(&file_rwsem);
6092 +       percpu_down_read(&file_rwsem);
6093         spin_lock(&ctx->flc_lock);
6094         /*
6095          * New lock request. Walk all POSIX locks and look for conflicts. If
6096 @@ -1185,7 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
6097         }
6098   out:
6099         spin_unlock(&ctx->flc_lock);
6100 -       percpu_up_read_preempt_enable(&file_rwsem);
6101 +       percpu_up_read(&file_rwsem);
6102         /*
6103          * Free any unused locks.
6104          */
6105 @@ -1460,7 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
6106                 return error;
6107         }
6108  
6109 -       percpu_down_read_preempt_disable(&file_rwsem);
6110 +       percpu_down_read(&file_rwsem);
6111         spin_lock(&ctx->flc_lock);
6112  
6113         time_out_leases(inode, &dispose);
6114 @@ -1512,13 +1512,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
6115         locks_insert_block(fl, new_fl);
6116         trace_break_lease_block(inode, new_fl);
6117         spin_unlock(&ctx->flc_lock);
6118 -       percpu_up_read_preempt_enable(&file_rwsem);
6119 +       percpu_up_read(&file_rwsem);
6120  
6121         locks_dispose_list(&dispose);
6122         error = wait_event_interruptible_timeout(new_fl->fl_wait,
6123                                                 !new_fl->fl_next, break_time);
6124  
6125 -       percpu_down_read_preempt_disable(&file_rwsem);
6126 +       percpu_down_read(&file_rwsem);
6127         spin_lock(&ctx->flc_lock);
6128         trace_break_lease_unblock(inode, new_fl);
6129         locks_delete_block(new_fl);
6130 @@ -1535,7 +1535,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
6131         }
6132  out:
6133         spin_unlock(&ctx->flc_lock);
6134 -       percpu_up_read_preempt_enable(&file_rwsem);
6135 +       percpu_up_read(&file_rwsem);
6136         locks_dispose_list(&dispose);
6137         locks_free_lock(new_fl);
6138         return error;
6139 @@ -1609,7 +1609,7 @@ int fcntl_getlease(struct file *filp)
6140  
6141         ctx = smp_load_acquire(&inode->i_flctx);
6142         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
6143 -               percpu_down_read_preempt_disable(&file_rwsem);
6144 +               percpu_down_read(&file_rwsem);
6145                 spin_lock(&ctx->flc_lock);
6146                 time_out_leases(inode, &dispose);
6147                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
6148 @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
6149                         break;
6150                 }
6151                 spin_unlock(&ctx->flc_lock);
6152 -               percpu_up_read_preempt_enable(&file_rwsem);
6153 +               percpu_up_read(&file_rwsem);
6154  
6155                 locks_dispose_list(&dispose);
6156         }
6157 @@ -1694,7 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
6158                 return -EINVAL;
6159         }
6160  
6161 -       percpu_down_read_preempt_disable(&file_rwsem);
6162 +       percpu_down_read(&file_rwsem);
6163         spin_lock(&ctx->flc_lock);
6164         time_out_leases(inode, &dispose);
6165         error = check_conflicting_open(dentry, arg, lease->fl_flags);
6166 @@ -1765,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
6167                 lease->fl_lmops->lm_setup(lease, priv);
6168  out:
6169         spin_unlock(&ctx->flc_lock);
6170 -       percpu_up_read_preempt_enable(&file_rwsem);
6171 +       percpu_up_read(&file_rwsem);
6172         locks_dispose_list(&dispose);
6173         if (is_deleg)
6174                 inode_unlock(inode);
6175 @@ -1788,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
6176                 return error;
6177         }
6178  
6179 -       percpu_down_read_preempt_disable(&file_rwsem);
6180 +       percpu_down_read(&file_rwsem);
6181         spin_lock(&ctx->flc_lock);
6182         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
6183                 if (fl->fl_file == filp &&
6184 @@ -1801,7 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
6185         if (victim)
6186                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
6187         spin_unlock(&ctx->flc_lock);
6188 -       percpu_up_read_preempt_enable(&file_rwsem);
6189 +       percpu_up_read(&file_rwsem);
6190         locks_dispose_list(&dispose);
6191         return error;
6192  }
6193 @@ -2532,13 +2532,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
6194         if (list_empty(&ctx->flc_lease))
6195                 return;
6196  
6197 -       percpu_down_read_preempt_disable(&file_rwsem);
6198 +       percpu_down_read(&file_rwsem);
6199         spin_lock(&ctx->flc_lock);
6200         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
6201                 if (filp == fl->fl_file)
6202                         lease_modify(fl, F_UNLCK, &dispose);
6203         spin_unlock(&ctx->flc_lock);
6204 -       percpu_up_read_preempt_enable(&file_rwsem);
6205 +       percpu_up_read(&file_rwsem);
6206  
6207         locks_dispose_list(&dispose);
6208  }
6209 diff --git a/fs/namei.c b/fs/namei.c
6210 index e7d125c23aa6..072a2f724437 100644
6211 --- a/fs/namei.c
6212 +++ b/fs/namei.c
6213 @@ -1626,7 +1626,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
6214  {
6215         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
6216         struct inode *inode = dir->d_inode;
6217 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6218 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6219  
6220         inode_lock_shared(inode);
6221         /* Don't go there if it's already dead */
6222 @@ -3089,7 +3089,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
6223         struct dentry *dentry;
6224         int error, create_error = 0;
6225         umode_t mode = op->mode;
6226 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6227 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6228  
6229         if (unlikely(IS_DEADDIR(dir_inode)))
6230                 return -ENOENT;
6231 diff --git a/fs/namespace.c b/fs/namespace.c
6232 index d7360f9897b4..da188c6966a3 100644
6233 --- a/fs/namespace.c
6234 +++ b/fs/namespace.c
6235 @@ -14,6 +14,7 @@
6236  #include <linux/mnt_namespace.h>
6237  #include <linux/user_namespace.h>
6238  #include <linux/namei.h>
6239 +#include <linux/delay.h>
6240  #include <linux/security.h>
6241  #include <linux/idr.h>
6242  #include <linux/init.h>                /* init_rootfs */
6243 @@ -357,8 +358,11 @@ int __mnt_want_write(struct vfsmount *m)
6244          * incremented count after it has set MNT_WRITE_HOLD.
6245          */
6246         smp_mb();
6247 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
6248 -               cpu_relax();
6249 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
6250 +               preempt_enable();
6251 +               cpu_chill();
6252 +               preempt_disable();
6253 +       }
6254         /*
6255          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
6256          * be set to match its requirements. So we must not load that until
6257 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
6258 index dff600ae0d74..d726d2e09353 100644
6259 --- a/fs/nfs/delegation.c
6260 +++ b/fs/nfs/delegation.c
6261 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
6262                 sp = state->owner;
6263                 /* Block nfs4_proc_unlck */
6264                 mutex_lock(&sp->so_delegreturn_mutex);
6265 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6266 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
6267                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
6268                 if (!err)
6269                         err = nfs_delegation_claim_locks(ctx, state, stateid);
6270 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6271 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
6272                         err = -EAGAIN;
6273                 mutex_unlock(&sp->so_delegreturn_mutex);
6274                 put_nfs_open_context(ctx);
6275 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
6276 index 65566d5fcf39..5f08183dddcd 100644
6277 --- a/fs/nfs/dir.c
6278 +++ b/fs/nfs/dir.c
6279 @@ -485,7 +485,7 @@ static
6280  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
6281  {
6282         struct qstr filename = QSTR_INIT(entry->name, entry->len);
6283 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6284 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6285         struct dentry *dentry;
6286         struct dentry *alias;
6287         struct inode *dir = d_inode(parent);
6288 @@ -1492,7 +1492,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
6289                     struct file *file, unsigned open_flags,
6290                     umode_t mode, int *opened)
6291  {
6292 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6293 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6294         struct nfs_open_context *ctx;
6295         struct dentry *res;
6296         struct iattr attr = { .ia_valid = ATTR_OPEN };
6297 @@ -1807,7 +1807,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6298  
6299         trace_nfs_rmdir_enter(dir, dentry);
6300         if (d_really_is_positive(dentry)) {
6301 +#ifdef CONFIG_PREEMPT_RT_BASE
6302 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
6303 +#else
6304                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6305 +#endif
6306                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6307                 /* Ensure the VFS deletes this inode */
6308                 switch (error) {
6309 @@ -1817,7 +1821,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6310                 case -ENOENT:
6311                         nfs_dentry_handle_enoent(dentry);
6312                 }
6313 +#ifdef CONFIG_PREEMPT_RT_BASE
6314 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
6315 +#else
6316                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6317 +#endif
6318         } else
6319                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6320         trace_nfs_rmdir_exit(dir, dentry, error);
6321 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
6322 index 76ae25661d3f..89159d298278 100644
6323 --- a/fs/nfs/inode.c
6324 +++ b/fs/nfs/inode.c
6325 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
6326         nfsi->nrequests = 0;
6327         nfsi->commit_info.ncommit = 0;
6328         atomic_set(&nfsi->commit_info.rpcs_out, 0);
6329 +#ifdef CONFIG_PREEMPT_RT_BASE
6330 +       sema_init(&nfsi->rmdir_sem, 1);
6331 +#else
6332         init_rwsem(&nfsi->rmdir_sem);
6333 +#endif
6334         nfs4_init_once(nfsi);
6335  }
6336  
6337 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
6338 index 1452177c822d..f43b01d54c59 100644
6339 --- a/fs/nfs/nfs4_fs.h
6340 +++ b/fs/nfs/nfs4_fs.h
6341 @@ -111,7 +111,7 @@ struct nfs4_state_owner {
6342         unsigned long        so_flags;
6343         struct list_head     so_states;
6344         struct nfs_seqid_counter so_seqid;
6345 -       seqcount_t           so_reclaim_seqcount;
6346 +       seqlock_t            so_reclaim_seqlock;
6347         struct mutex         so_delegreturn_mutex;
6348  };
6349  
6350 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
6351 index 4638654e26f3..5dd6fd555c72 100644
6352 --- a/fs/nfs/nfs4proc.c
6353 +++ b/fs/nfs/nfs4proc.c
6354 @@ -2691,7 +2691,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6355         unsigned int seq;
6356         int ret;
6357  
6358 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6359 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6360  
6361         ret = _nfs4_proc_open(opendata);
6362         if (ret != 0)
6363 @@ -2729,7 +2729,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6364  
6365         if (d_inode(dentry) == state->inode) {
6366                 nfs_inode_attach_open_context(ctx);
6367 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6368 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
6369                         nfs4_schedule_stateid_recovery(server, state);
6370         }
6371  out:
6372 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
6373 index 71deeae6eefd..4be6999299dc 100644
6374 --- a/fs/nfs/nfs4state.c
6375 +++ b/fs/nfs/nfs4state.c
6376 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
6377         nfs4_init_seqid_counter(&sp->so_seqid);
6378         atomic_set(&sp->so_count, 1);
6379         INIT_LIST_HEAD(&sp->so_lru);
6380 -       seqcount_init(&sp->so_reclaim_seqcount);
6381 +       seqlock_init(&sp->so_reclaim_seqlock);
6382         mutex_init(&sp->so_delegreturn_mutex);
6383         return sp;
6384  }
6385 @@ -1498,8 +1498,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6386          * recovering after a network partition or a reboot from a
6387          * server that doesn't support a grace period.
6388          */
6389 +#ifdef CONFIG_PREEMPT_RT_FULL
6390 +       write_seqlock(&sp->so_reclaim_seqlock);
6391 +#else
6392 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6393 +#endif
6394         spin_lock(&sp->so_lock);
6395 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
6396  restart:
6397         list_for_each_entry(state, &sp->so_states, open_states) {
6398                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
6399 @@ -1568,14 +1572,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6400                 spin_lock(&sp->so_lock);
6401                 goto restart;
6402         }
6403 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6404         spin_unlock(&sp->so_lock);
6405 +#ifdef CONFIG_PREEMPT_RT_FULL
6406 +       write_sequnlock(&sp->so_reclaim_seqlock);
6407 +#else
6408 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6409 +#endif
6410         return 0;
6411  out_err:
6412         nfs4_put_open_state(state);
6413 -       spin_lock(&sp->so_lock);
6414 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6415 -       spin_unlock(&sp->so_lock);
6416 +#ifdef CONFIG_PREEMPT_RT_FULL
6417 +       write_sequnlock(&sp->so_reclaim_seqlock);
6418 +#else
6419 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6420 +#endif
6421         return status;
6422  }
6423  
6424 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
6425 index 191aa577dd1f..58990c8f52e0 100644
6426 --- a/fs/nfs/unlink.c
6427 +++ b/fs/nfs/unlink.c
6428 @@ -12,7 +12,7 @@
6429  #include <linux/sunrpc/clnt.h>
6430  #include <linux/nfs_fs.h>
6431  #include <linux/sched.h>
6432 -#include <linux/wait.h>
6433 +#include <linux/swait.h>
6434  #include <linux/namei.h>
6435  #include <linux/fsnotify.h>
6436  
6437 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
6438                 rpc_restart_call_prepare(task);
6439  }
6440  
6441 +#ifdef CONFIG_PREEMPT_RT_BASE
6442 +static void nfs_down_anon(struct semaphore *sema)
6443 +{
6444 +       down(sema);
6445 +}
6446 +
6447 +static void nfs_up_anon(struct semaphore *sema)
6448 +{
6449 +       up(sema);
6450 +}
6451 +
6452 +#else
6453 +static void nfs_down_anon(struct rw_semaphore *rwsem)
6454 +{
6455 +       down_read_non_owner(rwsem);
6456 +}
6457 +
6458 +static void nfs_up_anon(struct rw_semaphore *rwsem)
6459 +{
6460 +       up_read_non_owner(rwsem);
6461 +}
6462 +#endif
6463 +
6464  /**
6465   * nfs_async_unlink_release - Release the sillydelete data.
6466   * @task: rpc_task of the sillydelete
6467 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
6468         struct dentry *dentry = data->dentry;
6469         struct super_block *sb = dentry->d_sb;
6470  
6471 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6472 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6473         d_lookup_done(dentry);
6474         nfs_free_unlinkdata(data);
6475         dput(dentry);
6476 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6477         struct inode *dir = d_inode(dentry->d_parent);
6478         struct dentry *alias;
6479  
6480 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6481 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6482         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6483         if (IS_ERR(alias)) {
6484 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6485 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6486                 return 0;
6487         }
6488         if (!d_in_lookup(alias)) {
6489 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6490                         ret = 0;
6491                 spin_unlock(&alias->d_lock);
6492                 dput(alias);
6493 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6494 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6495                 /*
6496                  * If we'd displaced old cached devname, free it.  At that
6497                  * point dentry is definitely not a root, so we won't need
6498 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
6499                 goto out_free_name;
6500         }
6501         data->res.dir_attr = &data->dir_attr;
6502 -       init_waitqueue_head(&data->wq);
6503 +       init_swait_queue_head(&data->wq);
6504  
6505         status = -EBUSY;
6506         spin_lock(&dentry->d_lock);
6507 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
6508 index fe251f187ff8..e89da4fb14c2 100644
6509 --- a/fs/ntfs/aops.c
6510 +++ b/fs/ntfs/aops.c
6511 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6512                         ofs = 0;
6513                         if (file_ofs < init_size)
6514                                 ofs = init_size - file_ofs;
6515 -                       local_irq_save(flags);
6516 +                       local_irq_save_nort(flags);
6517                         kaddr = kmap_atomic(page);
6518                         memset(kaddr + bh_offset(bh) + ofs, 0,
6519                                         bh->b_size - ofs);
6520                         flush_dcache_page(page);
6521                         kunmap_atomic(kaddr);
6522 -                       local_irq_restore(flags);
6523 +                       local_irq_restore_nort(flags);
6524                 }
6525         } else {
6526                 clear_buffer_uptodate(bh);
6527 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6528                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6529         }
6530         first = page_buffers(page);
6531 -       local_irq_save(flags);
6532 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6533 +       flags = bh_uptodate_lock_irqsave(first);
6534         clear_buffer_async_read(bh);
6535         unlock_buffer(bh);
6536         tmp = bh;
6537 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6538                 }
6539                 tmp = tmp->b_this_page;
6540         } while (tmp != bh);
6541 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6542 -       local_irq_restore(flags);
6543 +       bh_uptodate_unlock_irqrestore(first, flags);
6544         /*
6545          * If none of the buffers had errors then we can set the page uptodate,
6546          * but we first have to perform the post read mst fixups, if the
6547 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6548                 recs = PAGE_SIZE / rec_size;
6549                 /* Should have been verified before we got here... */
6550                 BUG_ON(!recs);
6551 -               local_irq_save(flags);
6552 +               local_irq_save_nort(flags);
6553                 kaddr = kmap_atomic(page);
6554                 for (i = 0; i < recs; i++)
6555                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6556                                         i * rec_size), rec_size);
6557                 kunmap_atomic(kaddr);
6558 -               local_irq_restore(flags);
6559 +               local_irq_restore_nort(flags);
6560                 flush_dcache_page(page);
6561                 if (likely(page_uptodate && !PageError(page)))
6562                         SetPageUptodate(page);
6563 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6564         unlock_page(page);
6565         return;
6566  still_busy:
6567 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6568 -       local_irq_restore(flags);
6569 -       return;
6570 +       bh_uptodate_unlock_irqrestore(first, flags);
6571  }
6572  
6573  /**
6574 diff --git a/fs/proc/base.c b/fs/proc/base.c
6575 index e67fec3c9856..0edc16f95596 100644
6576 --- a/fs/proc/base.c
6577 +++ b/fs/proc/base.c
6578 @@ -1834,7 +1834,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
6579  
6580         child = d_hash_and_lookup(dir, &qname);
6581         if (!child) {
6582 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6583 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6584                 child = d_alloc_parallel(dir, &qname, &wq);
6585                 if (IS_ERR(child))
6586                         goto end_instantiate;
6587 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
6588 index d4e37acd4821..000cea46434a 100644
6589 --- a/fs/proc/proc_sysctl.c
6590 +++ b/fs/proc/proc_sysctl.c
6591 @@ -632,7 +632,7 @@ static bool proc_sys_fill_cache(struct file *file,
6592  
6593         child = d_lookup(dir, &qname);
6594         if (!child) {
6595 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6596 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6597                 child = d_alloc_parallel(dir, &qname, &wq);
6598                 if (IS_ERR(child))
6599                         return false;
6600 diff --git a/fs/timerfd.c b/fs/timerfd.c
6601 index ab8dd1538381..5580853f57dd 100644
6602 --- a/fs/timerfd.c
6603 +++ b/fs/timerfd.c
6604 @@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
6605                                 break;
6606                 }
6607                 spin_unlock_irq(&ctx->wqh.lock);
6608 -               cpu_relax();
6609 +               if (isalarm(ctx))
6610 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6611 +               else
6612 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6613         }
6614  
6615         /*
6616 diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
6617 index d31cd1ebd8e9..5ea3f933a52a 100644
6618 --- a/fs/xfs/xfs_aops.c
6619 +++ b/fs/xfs/xfs_aops.c
6620 @@ -112,8 +112,7 @@ xfs_finish_page_writeback(
6621         ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
6622         ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
6623  
6624 -       local_irq_save(flags);
6625 -       bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
6626 +       flags = bh_uptodate_lock_irqsave(head);
6627         do {
6628                 if (off >= bvec->bv_offset &&
6629                     off < bvec->bv_offset + bvec->bv_len) {
6630 @@ -136,8 +135,7 @@ xfs_finish_page_writeback(
6631                 }
6632                 off += bh->b_size;
6633         } while ((bh = bh->b_this_page) != head);
6634 -       bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
6635 -       local_irq_restore(flags);
6636 +       bh_uptodate_unlock_irqrestore(head, flags);
6637  
6638         if (!busy)
6639                 end_page_writeback(bvec->bv_page);
6640 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
6641 index e861a24f06f2..b5c97d3059c7 100644
6642 --- a/include/acpi/platform/aclinux.h
6643 +++ b/include/acpi/platform/aclinux.h
6644 @@ -133,6 +133,7 @@
6645  
6646  #define acpi_cache_t                        struct kmem_cache
6647  #define acpi_spinlock                       spinlock_t *
6648 +#define acpi_raw_spinlock              raw_spinlock_t *
6649  #define acpi_cpu_flags                      unsigned long
6650  
6651  /* Use native linux version of acpi_os_allocate_zeroed */
6652 @@ -151,6 +152,20 @@
6653  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6654  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6655  
6656 +#define acpi_os_create_raw_lock(__handle)                      \
6657 +({                                                             \
6658 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6659 +                                                               \
6660 +        if (lock) {                                            \
6661 +               *(__handle) = lock;                             \
6662 +               raw_spin_lock_init(*(__handle));                \
6663 +        }                                                      \
6664 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6665 + })
6666 +
6667 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6668 +
6669 +
6670  /*
6671   * OSL interfaces used by debugger/disassembler
6672   */
6673 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
6674 index 6f96247226a4..fa53a21263c2 100644
6675 --- a/include/asm-generic/bug.h
6676 +++ b/include/asm-generic/bug.h
6677 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
6678  # define WARN_ON_SMP(x)                        ({0;})
6679  #endif
6680  
6681 +#ifdef CONFIG_PREEMPT_RT_BASE
6682 +# define BUG_ON_RT(c)                  BUG_ON(c)
6683 +# define BUG_ON_NONRT(c)               do { } while (0)
6684 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6685 +# define WARN_ON_NONRT(condition)      do { } while (0)
6686 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6687 +#else
6688 +# define BUG_ON_RT(c)                  do { } while (0)
6689 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6690 +# define WARN_ON_RT(condition)         do { } while (0)
6691 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6692 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6693 +#endif
6694 +
6695  #endif /* __ASSEMBLY__ */
6696  
6697  #endif
6698 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
6699 index 535ab2e13d2e..cfc246899473 100644
6700 --- a/include/linux/blk-mq.h
6701 +++ b/include/linux/blk-mq.h
6702 @@ -209,7 +209,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
6703         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6704  }
6705  
6706 -
6707 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6708  int blk_mq_request_started(struct request *rq);
6709  void blk_mq_start_request(struct request *rq);
6710  void blk_mq_end_request(struct request *rq, int error);
6711 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
6712 index f6a816129856..ec7a4676f8a8 100644
6713 --- a/include/linux/blkdev.h
6714 +++ b/include/linux/blkdev.h
6715 @@ -89,6 +89,7 @@ struct request {
6716         struct list_head queuelist;
6717         union {
6718                 struct call_single_data csd;
6719 +               struct work_struct work;
6720                 u64 fifo_time;
6721         };
6722  
6723 @@ -467,7 +468,7 @@ struct request_queue {
6724         struct throtl_data *td;
6725  #endif
6726         struct rcu_head         rcu_head;
6727 -       wait_queue_head_t       mq_freeze_wq;
6728 +       struct swait_queue_head mq_freeze_wq;
6729         struct percpu_ref       q_usage_counter;
6730         struct list_head        all_q_node;
6731  
6732 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
6733 index 8fdcb783197d..d07dbeec7bc1 100644
6734 --- a/include/linux/bottom_half.h
6735 +++ b/include/linux/bottom_half.h
6736 @@ -3,6 +3,39 @@
6737  
6738  #include <linux/preempt.h>
6739  
6740 +#ifdef CONFIG_PREEMPT_RT_FULL
6741 +
6742 +extern void __local_bh_disable(void);
6743 +extern void _local_bh_enable(void);
6744 +extern void __local_bh_enable(void);
6745 +
6746 +static inline void local_bh_disable(void)
6747 +{
6748 +       __local_bh_disable();
6749 +}
6750 +
6751 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6752 +{
6753 +       __local_bh_disable();
6754 +}
6755 +
6756 +static inline void local_bh_enable(void)
6757 +{
6758 +       __local_bh_enable();
6759 +}
6760 +
6761 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6762 +{
6763 +       __local_bh_enable();
6764 +}
6765 +
6766 +static inline void local_bh_enable_ip(unsigned long ip)
6767 +{
6768 +       __local_bh_enable();
6769 +}
6770 +
6771 +#else
6772 +
6773  #ifdef CONFIG_TRACE_IRQFLAGS
6774  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6775  #else
6776 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
6777  {
6778         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6779  }
6780 +#endif
6781  
6782  #endif /* _LINUX_BH_H */
6783 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
6784 index 4431ea2c8802..0744157a97ca 100644
6785 --- a/include/linux/buffer_head.h
6786 +++ b/include/linux/buffer_head.h
6787 @@ -75,8 +75,50 @@ struct buffer_head {
6788         struct address_space *b_assoc_map;      /* mapping this buffer is
6789                                                    associated with */
6790         atomic_t b_count;               /* users using this buffer_head */
6791 +#ifdef CONFIG_PREEMPT_RT_BASE
6792 +       spinlock_t b_uptodate_lock;
6793 +#if IS_ENABLED(CONFIG_JBD2)
6794 +       spinlock_t b_state_lock;
6795 +       spinlock_t b_journal_head_lock;
6796 +#endif
6797 +#endif
6798  };
6799  
6800 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6801 +{
6802 +       unsigned long flags;
6803 +
6804 +#ifndef CONFIG_PREEMPT_RT_BASE
6805 +       local_irq_save(flags);
6806 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6807 +#else
6808 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6809 +#endif
6810 +       return flags;
6811 +}
6812 +
6813 +static inline void
6814 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6815 +{
6816 +#ifndef CONFIG_PREEMPT_RT_BASE
6817 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6818 +       local_irq_restore(flags);
6819 +#else
6820 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6821 +#endif
6822 +}
6823 +
6824 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6825 +{
6826 +#ifdef CONFIG_PREEMPT_RT_BASE
6827 +       spin_lock_init(&bh->b_uptodate_lock);
6828 +#if IS_ENABLED(CONFIG_JBD2)
6829 +       spin_lock_init(&bh->b_state_lock);
6830 +       spin_lock_init(&bh->b_journal_head_lock);
6831 +#endif
6832 +#endif
6833 +}
6834 +
6835  /*
6836   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6837   * and buffer_foo() functions.
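
The helpers above let end-io code take the BH_Uptodate serialization the same way on both configurations: a bit spinlock with interrupts off on !RT, a real spinlock on RT. A minimal caller sketch (the function is hypothetical; the in-tree users such as the async end-io paths in fs/buffer.c are converted in later hunks), assuming buffer_head_init_locks() runs when the buffer head is allocated:

#include <linux/buffer_head.h>

/* Mark a buffer up to date under the uptodate lock (sketch only). */
static void demo_mark_uptodate(struct buffer_head *bh)
{
	unsigned long flags;

	flags = bh_uptodate_lock_irqsave(bh);
	set_buffer_uptodate(bh);
	bh_uptodate_unlock_irqrestore(bh, flags);
}
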
6838 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
6839 index 6fb1c34cf805..ccd2a5addb56 100644
6840 --- a/include/linux/cgroup-defs.h
6841 +++ b/include/linux/cgroup-defs.h
6842 @@ -16,6 +16,7 @@
6843  #include <linux/percpu-refcount.h>
6844  #include <linux/percpu-rwsem.h>
6845  #include <linux/workqueue.h>
6846 +#include <linux/swork.h>
6847  
6848  #ifdef CONFIG_CGROUPS
6849  
6850 @@ -138,6 +139,7 @@ struct cgroup_subsys_state {
6851         /* percpu_ref killing and RCU release */
6852         struct rcu_head rcu_head;
6853         struct work_struct destroy_work;
6854 +       struct swork_event destroy_swork;
6855  };
6856  
6857  /*
6858 diff --git a/include/linux/completion.h b/include/linux/completion.h
6859 index 5d5aaae3af43..3bca1590e29f 100644
6860 --- a/include/linux/completion.h
6861 +++ b/include/linux/completion.h
6862 @@ -7,8 +7,7 @@
6863   * Atomic wait-for-completion handler data structures.
6864   * See kernel/sched/completion.c for details.
6865   */
6866 -
6867 -#include <linux/wait.h>
6868 +#include <linux/swait.h>
6869  
6870  /*
6871   * struct completion - structure used to maintain state for a "completion"
6872 @@ -24,11 +23,11 @@
6873   */
6874  struct completion {
6875         unsigned int done;
6876 -       wait_queue_head_t wait;
6877 +       struct swait_queue_head wait;
6878  };
6879  
6880  #define COMPLETION_INITIALIZER(work) \
6881 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6882 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6883  
6884  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6885         ({ init_completion(&work); work; })
6886 @@ -73,7 +72,7 @@ struct completion {
6887  static inline void init_completion(struct completion *x)
6888  {
6889         x->done = 0;
6890 -       init_waitqueue_head(&x->wait);
6891 +       init_swait_queue_head(&x->wait);
6892  }
6893  
6894  /**
6895 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
6896 index e571128ad99a..5e52d28c20c1 100644
6897 --- a/include/linux/cpu.h
6898 +++ b/include/linux/cpu.h
6899 @@ -182,6 +182,8 @@ extern void get_online_cpus(void);
6900  extern void put_online_cpus(void);
6901  extern void cpu_hotplug_disable(void);
6902  extern void cpu_hotplug_enable(void);
6903 +extern void pin_current_cpu(void);
6904 +extern void unpin_current_cpu(void);
6905  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6906  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6907  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6908 @@ -199,6 +201,8 @@ static inline void cpu_hotplug_done(void) {}
6909  #define put_online_cpus()      do { } while (0)
6910  #define cpu_hotplug_disable()  do { } while (0)
6911  #define cpu_hotplug_enable()   do { } while (0)
6912 +static inline void pin_current_cpu(void) { }
6913 +static inline void unpin_current_cpu(void) { }
6914  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6915  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6916  /* These aren't inline functions due to a GCC bug. */
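
pin_current_cpu()/unpin_current_cpu() are only declared here; the sketch below assumes the behaviour implemented elsewhere in this patch (kernel/cpu.c, not shown): holding off hotplug of the CPU the task is currently on.

#include <linux/cpu.h>

/* Keep this CPU from being unplugged across a short, preemptible region. */
static void demo_hotplug_pinned_region(void)
{
	pin_current_cpu();
	/* ... touch state that must not race with this CPU going offline ... */
	unpin_current_cpu();
}
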
6917 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
6918 index ff295e166b2c..d532c60f3fb5 100644
6919 --- a/include/linux/dcache.h
6920 +++ b/include/linux/dcache.h
6921 @@ -11,6 +11,7 @@
6922  #include <linux/rcupdate.h>
6923  #include <linux/lockref.h>
6924  #include <linux/stringhash.h>
6925 +#include <linux/wait.h>
6926  
6927  struct path;
6928  struct vfsmount;
6929 @@ -100,7 +101,7 @@ struct dentry {
6930  
6931         union {
6932                 struct list_head d_lru;         /* LRU list */
6933 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6934 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6935         };
6936         struct list_head d_child;       /* child of parent list */
6937         struct list_head d_subdirs;     /* our children */
6938 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
6939  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6940  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6941  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6942 -                                       wait_queue_head_t *);
6943 +                                       struct swait_queue_head *);
6944  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6945  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6946  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6947 diff --git a/include/linux/delay.h b/include/linux/delay.h
6948 index a6ecb34cf547..37caab306336 100644
6949 --- a/include/linux/delay.h
6950 +++ b/include/linux/delay.h
6951 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
6952         msleep(seconds * 1000);
6953  }
6954  
6955 +#ifdef CONFIG_PREEMPT_RT_FULL
6956 +extern void cpu_chill(void);
6957 +#else
6958 +# define cpu_chill()   cpu_relax()
6959 +#endif
6960 +
6961  #endif /* defined(_LINUX_DELAY_H) */
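
cpu_chill() is the RT replacement for busy-wait retry loops: on !RT it is plain cpu_relax(), on RT it sleeps briefly so a preempted lock holder can run. A sketch of the trylock/retry pattern it is meant for (the lock is hypothetical; the actual conversions appear in later hunks of this patch):

#include <linux/delay.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);	/* hypothetical lock */

static void demo_wait_and_lock(void)
{
	/*
	 * Retry loop: cpu_relax() on !RT, a short sleep on RT so the
	 * current lock holder (which may be preempted) can make progress.
	 */
	while (!spin_trylock(&demo_lock))
		cpu_chill();

	/* ... critical section ... */
	spin_unlock(&demo_lock);
}
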
6962 diff --git a/include/linux/fs.h b/include/linux/fs.h
6963 index d705ae084edd..ab1946f4a729 100644
6964 --- a/include/linux/fs.h
6965 +++ b/include/linux/fs.h
6966 @@ -688,7 +688,7 @@ struct inode {
6967                 struct block_device     *i_bdev;
6968                 struct cdev             *i_cdev;
6969                 char                    *i_link;
6970 -               unsigned                i_dir_seq;
6971 +               unsigned                __i_dir_seq;
6972         };
6973  
6974         __u32                   i_generation;
6975 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
6976 index bb3f3297062a..a117a33ef72c 100644
6977 --- a/include/linux/highmem.h
6978 +++ b/include/linux/highmem.h
6979 @@ -7,6 +7,7 @@
6980  #include <linux/mm.h>
6981  #include <linux/uaccess.h>
6982  #include <linux/hardirq.h>
6983 +#include <linux/sched.h>
6984  
6985  #include <asm/cacheflush.h>
6986  
6987 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
6988  
6989  static inline void *kmap_atomic(struct page *page)
6990  {
6991 -       preempt_disable();
6992 +       preempt_disable_nort();
6993         pagefault_disable();
6994         return page_address(page);
6995  }
6996 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
6997  static inline void __kunmap_atomic(void *addr)
6998  {
6999         pagefault_enable();
7000 -       preempt_enable();
7001 +       preempt_enable_nort();
7002  }
7003  
7004  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
7005 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
7006  
7007  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
7008  
7009 +#ifndef CONFIG_PREEMPT_RT_FULL
7010  DECLARE_PER_CPU(int, __kmap_atomic_idx);
7011 +#endif
7012  
7013  static inline int kmap_atomic_idx_push(void)
7014  {
7015 +#ifndef CONFIG_PREEMPT_RT_FULL
7016         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
7017  
7018 -#ifdef CONFIG_DEBUG_HIGHMEM
7019 +# ifdef CONFIG_DEBUG_HIGHMEM
7020         WARN_ON_ONCE(in_irq() && !irqs_disabled());
7021         BUG_ON(idx >= KM_TYPE_NR);
7022 -#endif
7023 +# endif
7024         return idx;
7025 +#else
7026 +       current->kmap_idx++;
7027 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
7028 +       return current->kmap_idx - 1;
7029 +#endif
7030  }
7031  
7032  static inline int kmap_atomic_idx(void)
7033  {
7034 +#ifndef CONFIG_PREEMPT_RT_FULL
7035         return __this_cpu_read(__kmap_atomic_idx) - 1;
7036 +#else
7037 +       return current->kmap_idx - 1;
7038 +#endif
7039  }
7040  
7041  static inline void kmap_atomic_idx_pop(void)
7042  {
7043 -#ifdef CONFIG_DEBUG_HIGHMEM
7044 +#ifndef CONFIG_PREEMPT_RT_FULL
7045 +# ifdef CONFIG_DEBUG_HIGHMEM
7046         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
7047  
7048         BUG_ON(idx < 0);
7049 -#else
7050 +# else
7051         __this_cpu_dec(__kmap_atomic_idx);
7052 +# endif
7053 +#else
7054 +       current->kmap_idx--;
7055 +# ifdef CONFIG_DEBUG_HIGHMEM
7056 +       BUG_ON(current->kmap_idx < 0);
7057 +# endif
7058  #endif
7059  }
7060  
7061 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
7062 index 5e00f80b1535..a34e10b55cde 100644
7063 --- a/include/linux/hrtimer.h
7064 +++ b/include/linux/hrtimer.h
7065 @@ -87,6 +87,9 @@ enum hrtimer_restart {
7066   * @function:  timer expiry callback function
7067   * @base:      pointer to the timer base (per cpu and per clock)
7068   * @state:     state information (See bit values above)
7069 + * @cb_entry:  list entry to defer timers from hardirq context
7070 + * @irqsafe:   timer can run in hardirq context
7071 + * @praecox:   timer expiry time if expired at the time of programming
7072   * @is_rel:    Set if the timer was armed relative
7073   * @start_pid:  timer statistics field to store the pid of the task which
7074   *             started the timer
7075 @@ -103,6 +106,11 @@ struct hrtimer {
7076         enum hrtimer_restart            (*function)(struct hrtimer *);
7077         struct hrtimer_clock_base       *base;
7078         u8                              state;
7079 +       struct list_head                cb_entry;
7080 +       int                             irqsafe;
7081 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
7082 +       ktime_t                         praecox;
7083 +#endif
7084         u8                              is_rel;
7085  #ifdef CONFIG_TIMER_STATS
7086         int                             start_pid;
7087 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
7088         struct task_struct *task;
7089  };
7090  
7091 -#ifdef CONFIG_64BIT
7092  # define HRTIMER_CLOCK_BASE_ALIGN      64
7093 -#else
7094 -# define HRTIMER_CLOCK_BASE_ALIGN      32
7095 -#endif
7096  
7097  /**
7098   * struct hrtimer_clock_base - the timer base for a specific clock
7099 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
7100   *                     timer to a base on another cpu.
7101   * @clockid:           clock id for per_cpu support
7102   * @active:            red black tree root node for the active timers
7103 + * @expired:           list head for deferred timers.
7104   * @get_time:          function to retrieve the current time of the clock
7105   * @offset:            offset of this clock to the monotonic base
7106   */
7107 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
7108         int                     index;
7109         clockid_t               clockid;
7110         struct timerqueue_head  active;
7111 +       struct list_head        expired;
7112         ktime_t                 (*get_time)(void);
7113         ktime_t                 offset;
7114  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
7115 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
7116         raw_spinlock_t                  lock;
7117         seqcount_t                      seq;
7118         struct hrtimer                  *running;
7119 +       struct hrtimer                  *running_soft;
7120         unsigned int                    cpu;
7121         unsigned int                    active_bases;
7122         unsigned int                    clock_was_set_seq;
7123 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
7124         unsigned int                    nr_hangs;
7125         unsigned int                    max_hang_time;
7126  #endif
7127 +#ifdef CONFIG_PREEMPT_RT_BASE
7128 +       wait_queue_head_t               wait;
7129 +#endif
7130         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
7131  } ____cacheline_aligned;
7132  
7133 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
7134         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
7135  }
7136  
7137 +/* Softirq preemption could deadlock timer removal */
7138 +#ifdef CONFIG_PREEMPT_RT_BASE
7139 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
7140 +#else
7141 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
7142 +#endif
7143 +
7144  /* Query timers: */
7145  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
7146  
7147 @@ -436,9 +453,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
7148   * Helper function to check, whether the timer is running the callback
7149   * function
7150   */
7151 -static inline int hrtimer_callback_running(struct hrtimer *timer)
7152 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
7153  {
7154 -       return timer->base->cpu_base->running == timer;
7155 +       if (timer->base->cpu_base->running == timer)
7156 +               return 1;
7157 +#ifdef CONFIG_PREEMPT_RT_BASE
7158 +       if (timer->base->cpu_base->running_soft == timer)
7159 +               return 1;
7160 +#endif
7161 +       return 0;
7162  }
7163  
7164  /* Forward a hrtimer so it expires after now: */
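
The "Softirq preemption could deadlock timer removal" comment above is the reason hrtimer_wait_for_timer() exists: a caller that spins until the callback finishes would deadlock if the softirq thread running that callback has been preempted by the spinner. A condensed sketch of the removal pattern (the in-tree hrtimer_cancel() is adapted along these lines elsewhere in this patch):

#include <linux/hrtimer.h>

/* Cancel a timer and wait out a possibly running callback (sketch only). */
static void demo_cancel_sync(struct hrtimer *timer)
{
	while (hrtimer_try_to_cancel(timer) < 0) {
		/*
		 * < 0 means the callback is running right now: sleep on RT
		 * (hrtimer_wait_for_timer), just cpu_relax() on !RT.
		 */
		hrtimer_wait_for_timer(timer);
	}
}
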
7165 diff --git a/include/linux/idr.h b/include/linux/idr.h
7166 index 083d61e92706..5899796f50cb 100644
7167 --- a/include/linux/idr.h
7168 +++ b/include/linux/idr.h
7169 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
7170   * Each idr_preload() should be matched with an invocation of this
7171   * function.  See idr_preload() for details.
7172   */
7173 +#ifdef CONFIG_PREEMPT_RT_FULL
7174 +void idr_preload_end(void);
7175 +#else
7176  static inline void idr_preload_end(void)
7177  {
7178         preempt_enable();
7179  }
7180 +#endif
7181  
7182  /**
7183   * idr_find - return pointer for given id
7184 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
7185 index 325f649d77ff..a56e263f5005 100644
7186 --- a/include/linux/init_task.h
7187 +++ b/include/linux/init_task.h
7188 @@ -150,6 +150,12 @@ extern struct task_group root_task_group;
7189  # define INIT_PERF_EVENTS(tsk)
7190  #endif
7191  
7192 +#ifdef CONFIG_PREEMPT_RT_BASE
7193 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
7194 +#else
7195 +# define INIT_TIMER_LIST
7196 +#endif
7197 +
7198  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
7199  # define INIT_VTIME(tsk)                                               \
7200         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
7201 @@ -164,6 +170,7 @@ extern struct task_group root_task_group;
7202  #ifdef CONFIG_RT_MUTEXES
7203  # define INIT_RT_MUTEXES(tsk)                                          \
7204         .pi_waiters = RB_ROOT,                                          \
7205 +       .pi_top_task = NULL,                                            \
7206         .pi_waiters_leftmost = NULL,
7207  #else
7208  # define INIT_RT_MUTEXES(tsk)
7209 @@ -250,6 +257,7 @@ extern struct task_group root_task_group;
7210         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
7211         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
7212         .timer_slack_ns = 50000, /* 50 usec default slack */            \
7213 +       INIT_TIMER_LIST                                                 \
7214         .pids = {                                                       \
7215                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
7216                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
7217 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
7218 index 72f0721f75e7..480972ae47d3 100644
7219 --- a/include/linux/interrupt.h
7220 +++ b/include/linux/interrupt.h
7221 @@ -14,6 +14,7 @@
7222  #include <linux/hrtimer.h>
7223  #include <linux/kref.h>
7224  #include <linux/workqueue.h>
7225 +#include <linux/swork.h>
7226  
7227  #include <linux/atomic.h>
7228  #include <asm/ptrace.h>
7229 @@ -61,6 +62,7 @@
7230   *                interrupt handler after suspending interrupts. For system
7231   *                wakeup devices users need to implement wakeup detection in
7232   *                their interrupt handlers.
7233 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
7234   */
7235  #define IRQF_SHARED            0x00000080
7236  #define IRQF_PROBE_SHARED      0x00000100
7237 @@ -74,6 +76,7 @@
7238  #define IRQF_NO_THREAD         0x00010000
7239  #define IRQF_EARLY_RESUME      0x00020000
7240  #define IRQF_COND_SUSPEND      0x00040000
7241 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
7242  
7243  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
7244  
7245 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
7246  #ifdef CONFIG_LOCKDEP
7247  # define local_irq_enable_in_hardirq() do { } while (0)
7248  #else
7249 -# define local_irq_enable_in_hardirq() local_irq_enable()
7250 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
7251  #endif
7252  
7253  extern void disable_irq_nosync(unsigned int irq);
7254 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
7255   * struct irq_affinity_notify - context for notification of IRQ affinity changes
7256   * @irq:               Interrupt to which notification applies
7257   * @kref:              Reference count, for internal use
7258 + * @swork:             Swork item, for internal use
7259   * @work:              Work item, for internal use
7260   * @notify:            Function to be called on change.  This will be
7261   *                     called in process context.
7262 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
7263  struct irq_affinity_notify {
7264         unsigned int irq;
7265         struct kref kref;
7266 +#ifdef CONFIG_PREEMPT_RT_BASE
7267 +       struct swork_event swork;
7268 +#else
7269         struct work_struct work;
7270 +#endif
7271         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
7272         void (*release)(struct kref *ref);
7273  };
7274 @@ -406,9 +414,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
7275                                  bool state);
7276  
7277  #ifdef CONFIG_IRQ_FORCED_THREADING
7278 +# ifndef CONFIG_PREEMPT_RT_BASE
7279  extern bool force_irqthreads;
7280 +# else
7281 +#  define force_irqthreads     (true)
7282 +# endif
7283  #else
7284 -#define force_irqthreads       (0)
7285 +#define force_irqthreads       (false)
7286  #endif
7287  
7288  #ifndef __ARCH_SET_SOFTIRQ_PENDING
7289 @@ -465,9 +477,10 @@ struct softirq_action
7290         void    (*action)(struct softirq_action *);
7291  };
7292  
7293 +#ifndef CONFIG_PREEMPT_RT_FULL
7294  asmlinkage void do_softirq(void);
7295  asmlinkage void __do_softirq(void);
7296 -
7297 +static inline void thread_do_softirq(void) { do_softirq(); }
7298  #ifdef __ARCH_HAS_DO_SOFTIRQ
7299  void do_softirq_own_stack(void);
7300  #else
7301 @@ -476,13 +489,25 @@ static inline void do_softirq_own_stack(void)
7302         __do_softirq();
7303  }
7304  #endif
7305 +#else
7306 +extern void thread_do_softirq(void);
7307 +#endif
7308  
7309  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
7310  extern void softirq_init(void);
7311  extern void __raise_softirq_irqoff(unsigned int nr);
7312 +#ifdef CONFIG_PREEMPT_RT_FULL
7313 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
7314 +#else
7315 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
7316 +{
7317 +       __raise_softirq_irqoff(nr);
7318 +}
7319 +#endif
7320  
7321  extern void raise_softirq_irqoff(unsigned int nr);
7322  extern void raise_softirq(unsigned int nr);
7323 +extern void softirq_check_pending_idle(void);
7324  
7325  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
7326  
7327 @@ -504,8 +529,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
7328       to be executed on some cpu at least once after this.
7329     * If the tasklet is already scheduled, but its execution is still not
7330       started, it will be executed only once.
7331 -   * If this tasklet is already running on another CPU (or schedule is called
7332 -     from tasklet itself), it is rescheduled for later.
7333 +   * If this tasklet is already running on another CPU, it is rescheduled
7334 +     for later.
7335 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
7336     * Tasklet is strictly serialized wrt itself, but not
7337       wrt another tasklets. If client needs some intertask synchronization,
7338       he makes it with spinlocks.
7339 @@ -530,27 +556,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
7340  enum
7341  {
7342         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
7343 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
7344 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
7345 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
7346  };
7347  
7348 -#ifdef CONFIG_SMP
7349 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
7350 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
7351 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
7352 +
7353 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
7354  static inline int tasklet_trylock(struct tasklet_struct *t)
7355  {
7356         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
7357  }
7358  
7359 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
7360 +{
7361 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
7362 +}
7363 +
7364  static inline void tasklet_unlock(struct tasklet_struct *t)
7365  {
7366         smp_mb__before_atomic();
7367         clear_bit(TASKLET_STATE_RUN, &(t)->state);
7368  }
7369  
7370 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
7371 -{
7372 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
7373 -}
7374 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
7375 +
7376  #else
7377  #define tasklet_trylock(t) 1
7378 +#define tasklet_tryunlock(t)   1
7379  #define tasklet_unlock_wait(t) do { } while (0)
7380  #define tasklet_unlock(t) do { } while (0)
7381  #endif
7382 @@ -599,12 +634,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
7383         smp_mb();
7384  }
7385  
7386 -static inline void tasklet_enable(struct tasklet_struct *t)
7387 -{
7388 -       smp_mb__before_atomic();
7389 -       atomic_dec(&t->count);
7390 -}
7391 -
7392 +extern void tasklet_enable(struct tasklet_struct *t);
7393  extern void tasklet_kill(struct tasklet_struct *t);
7394  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
7395  extern void tasklet_init(struct tasklet_struct *t,
7396 @@ -635,6 +665,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
7397         tasklet_kill(&ttimer->tasklet);
7398  }
7399  
7400 +#ifdef CONFIG_PREEMPT_RT_FULL
7401 +extern void softirq_early_init(void);
7402 +#else
7403 +static inline void softirq_early_init(void) { }
7404 +#endif
7405 +
7406  /*
7407   * Autoprobing for irqs:
7408   *
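
From a driver's point of view IRQF_NO_SOFTIRQ_CALL is just another request_irq() flag; the core honours it elsewhere in this patch by not running softirqs in that interrupt's thread. A hypothetical request (handler, name and device cookie are illustrative):

#include <linux/interrupt.h>

static irqreturn_t demo_handler(int irq, void *dev_id)
{
	/* hard interrupt work only */
	return IRQ_HANDLED;
}

static int demo_setup_irq(unsigned int irq, void *dev)
{
	return request_irq(irq, demo_handler,
			   IRQF_NO_SOFTIRQ_CALL, "demo-dev", dev);
}
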
7409 diff --git a/include/linux/irq.h b/include/linux/irq.h
7410 index 39e3254e5769..8ebac94fbb9f 100644
7411 --- a/include/linux/irq.h
7412 +++ b/include/linux/irq.h
7413 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
7414   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
7415   *                               it from the spurious interrupt detection
7416   *                               mechanism and from core side polling.
7417 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
7418   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
7419   */
7420  enum {
7421 @@ -99,13 +100,14 @@ enum {
7422         IRQ_PER_CPU_DEVID       = (1 << 17),
7423         IRQ_IS_POLLED           = (1 << 18),
7424         IRQ_DISABLE_UNLAZY      = (1 << 19),
7425 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
7426  };
7427  
7428  #define IRQF_MODIFY_MASK       \
7429         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
7430          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
7431          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
7432 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
7433 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
7434  
7435  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
7436  
7437 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
7438 index 47b9ebd4a74f..2543aab05daa 100644
7439 --- a/include/linux/irq_work.h
7440 +++ b/include/linux/irq_work.h
7441 @@ -16,6 +16,7 @@
7442  #define IRQ_WORK_BUSY          2UL
7443  #define IRQ_WORK_FLAGS         3UL
7444  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
7445 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
7446  
7447  struct irq_work {
7448         unsigned long flags;
7449 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
7450  static inline void irq_work_run(void) { }
7451  #endif
7452  
7453 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
7454 +void irq_work_tick_soft(void);
7455 +#else
7456 +static inline void irq_work_tick_soft(void) { }
7457 +#endif
7458 +
7459  #endif /* _LINUX_IRQ_WORK_H */
7460 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
7461 index c9be57931b58..eeeb540971ae 100644
7462 --- a/include/linux/irqdesc.h
7463 +++ b/include/linux/irqdesc.h
7464 @@ -66,6 +66,7 @@ struct irq_desc {
7465         unsigned int            irqs_unhandled;
7466         atomic_t                threads_handled;
7467         int                     threads_handled_last;
7468 +       u64                     random_ip;
7469         raw_spinlock_t          lock;
7470         struct cpumask          *percpu_enabled;
7471         const struct cpumask    *percpu_affinity;
7472 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
7473 index 5dd1272d1ab2..9b77034f7c5e 100644
7474 --- a/include/linux/irqflags.h
7475 +++ b/include/linux/irqflags.h
7476 @@ -25,8 +25,6 @@
7477  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
7478  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
7479  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
7480 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
7481 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
7482  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
7483  #else
7484  # define trace_hardirqs_on()           do { } while (0)
7485 @@ -39,9 +37,15 @@
7486  # define trace_softirqs_enabled(p)     0
7487  # define trace_hardirq_enter()         do { } while (0)
7488  # define trace_hardirq_exit()          do { } while (0)
7489 +# define INIT_TRACE_IRQFLAGS
7490 +#endif
7491 +
7492 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
7493 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
7494 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
7495 +#else
7496  # define lockdep_softirq_enter()       do { } while (0)
7497  # define lockdep_softirq_exit()                do { } while (0)
7498 -# define INIT_TRACE_IRQFLAGS
7499  #endif
7500  
7501  #if defined(CONFIG_IRQSOFF_TRACER) || \
7502 @@ -148,4 +152,23 @@
7503  
7504  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
7505  
7506 +/*
7507 + * local_irq* variants depending on RT/!RT
7508 + */
7509 +#ifdef CONFIG_PREEMPT_RT_FULL
7510 +# define local_irq_disable_nort()      do { } while (0)
7511 +# define local_irq_enable_nort()       do { } while (0)
7512 +# define local_irq_save_nort(flags)    local_save_flags(flags)
7513 +# define local_irq_restore_nort(flags) (void)(flags)
7514 +# define local_irq_disable_rt()                local_irq_disable()
7515 +# define local_irq_enable_rt()         local_irq_enable()
7516 +#else
7517 +# define local_irq_disable_nort()      local_irq_disable()
7518 +# define local_irq_enable_nort()       local_irq_enable()
7519 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7520 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7521 +# define local_irq_disable_rt()                do { } while (0)
7522 +# define local_irq_enable_rt()         do { } while (0)
7523 +#endif
7524 +
7525  #endif
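
The _nort/_rt pairs above are what most of the driver and mm hunks in this series use: sections that need hard interrupts off only on !RT keep local_irq_*_nort(), which collapses to a flags save/restore on RT where a sleeping lock provides the exclusion instead. A minimal sketch:

#include <linux/irqflags.h>

static void demo_short_section(void)
{
	unsigned long flags;

	/* Really disables interrupts on !RT; only saves flags on RT. */
	local_irq_save_nort(flags);
	/* ... short region that must not race with the hard irq on !RT ... */
	local_irq_restore_nort(flags);
}
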
7526 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
7527 index dfaa1f4dcb0c..d57dd06544a1 100644
7528 --- a/include/linux/jbd2.h
7529 +++ b/include/linux/jbd2.h
7530 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
7531  
7532  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7533  {
7534 +#ifndef CONFIG_PREEMPT_RT_BASE
7535         bit_spin_lock(BH_State, &bh->b_state);
7536 +#else
7537 +       spin_lock(&bh->b_state_lock);
7538 +#endif
7539  }
7540  
7541  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7542  {
7543 +#ifndef CONFIG_PREEMPT_RT_BASE
7544         return bit_spin_trylock(BH_State, &bh->b_state);
7545 +#else
7546 +       return spin_trylock(&bh->b_state_lock);
7547 +#endif
7548  }
7549  
7550  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7551  {
7552 +#ifndef CONFIG_PREEMPT_RT_BASE
7553         return bit_spin_is_locked(BH_State, &bh->b_state);
7554 +#else
7555 +       return spin_is_locked(&bh->b_state_lock);
7556 +#endif
7557  }
7558  
7559  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7560  {
7561 +#ifndef CONFIG_PREEMPT_RT_BASE
7562         bit_spin_unlock(BH_State, &bh->b_state);
7563 +#else
7564 +       spin_unlock(&bh->b_state_lock);
7565 +#endif
7566  }
7567  
7568  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7569  {
7570 +#ifndef CONFIG_PREEMPT_RT_BASE
7571         bit_spin_lock(BH_JournalHead, &bh->b_state);
7572 +#else
7573 +       spin_lock(&bh->b_journal_head_lock);
7574 +#endif
7575  }
7576  
7577  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7578  {
7579 +#ifndef CONFIG_PREEMPT_RT_BASE
7580         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7581 +#else
7582 +       spin_unlock(&bh->b_journal_head_lock);
7583 +#endif
7584  }
7585  
7586  #define J_ASSERT(assert)       BUG_ON(!(assert))
7587 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
7588 index 410decacff8f..0861bebfc188 100644
7589 --- a/include/linux/kdb.h
7590 +++ b/include/linux/kdb.h
7591 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
7592  extern __printf(1, 2) int kdb_printf(const char *, ...);
7593  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7594  
7595 +#define in_kdb_printk()        (kdb_trap_printk)
7596  extern void kdb_init(int level);
7597  
7598  /* Access to kdb specific polling devices */
7599 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
7600  extern int kdb_unregister(char *);
7601  #else /* ! CONFIG_KGDB_KDB */
7602  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7603 +#define in_kdb_printk() (0)
7604  static inline void kdb_init(int level) {}
7605  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7606                                char *help, short minlen) { return 0; }
7607 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
7608 index bc6ed52a39b9..7894d55e4998 100644
7609 --- a/include/linux/kernel.h
7610 +++ b/include/linux/kernel.h
7611 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
7612   */
7613  # define might_sleep() \
7614         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7615 +
7616 +# define might_sleep_no_state_check() \
7617 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7618  # define sched_annotate_sleep()        (current->task_state_change = 0)
7619  #else
7620    static inline void ___might_sleep(const char *file, int line,
7621 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
7622    static inline void __might_sleep(const char *file, int line,
7623                                    int preempt_offset) { }
7624  # define might_sleep() do { might_resched(); } while (0)
7625 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7626  # define sched_annotate_sleep() do { } while (0)
7627  #endif
7628  
7629 @@ -488,6 +492,7 @@ extern enum system_states {
7630         SYSTEM_HALT,
7631         SYSTEM_POWER_OFF,
7632         SYSTEM_RESTART,
7633 +       SYSTEM_SUSPEND,
7634  } system_state;
7635  
7636  #define TAINT_PROPRIETARY_MODULE       0
7637 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
7638 index cb483305e1f5..4e5062316bb6 100644
7639 --- a/include/linux/list_bl.h
7640 +++ b/include/linux/list_bl.h
7641 @@ -2,6 +2,7 @@
7642  #define _LINUX_LIST_BL_H
7643  
7644  #include <linux/list.h>
7645 +#include <linux/spinlock.h>
7646  #include <linux/bit_spinlock.h>
7647  
7648  /*
7649 @@ -32,13 +33,24 @@
7650  
7651  struct hlist_bl_head {
7652         struct hlist_bl_node *first;
7653 +#ifdef CONFIG_PREEMPT_RT_BASE
7654 +       raw_spinlock_t lock;
7655 +#endif
7656  };
7657  
7658  struct hlist_bl_node {
7659         struct hlist_bl_node *next, **pprev;
7660  };
7661 -#define INIT_HLIST_BL_HEAD(ptr) \
7662 -       ((ptr)->first = NULL)
7663 +
7664 +#ifdef CONFIG_PREEMPT_RT_BASE
7665 +#define INIT_HLIST_BL_HEAD(h)          \
7666 +do {                                   \
7667 +       (h)->first = NULL;              \
7668 +       raw_spin_lock_init(&(h)->lock); \
7669 +} while (0)
7670 +#else
7671 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7672 +#endif
7673  
7674  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7675  {
7676 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
7677  
7678  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7679  {
7680 +#ifndef CONFIG_PREEMPT_RT_BASE
7681         bit_spin_lock(0, (unsigned long *)b);
7682 +#else
7683 +       raw_spin_lock(&b->lock);
7684 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7685 +       __set_bit(0, (unsigned long *)b);
7686 +#endif
7687 +#endif
7688  }
7689  
7690  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7691  {
7692 +#ifndef CONFIG_PREEMPT_RT_BASE
7693         __bit_spin_unlock(0, (unsigned long *)b);
7694 +#else
7695 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7696 +       __clear_bit(0, (unsigned long *)b);
7697 +#endif
7698 +       raw_spin_unlock(&b->lock);
7699 +#endif
7700  }
7701  
7702  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
7703 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
7704 new file mode 100644
7705 index 000000000000..280f884a05a3
7706 --- /dev/null
7707 +++ b/include/linux/locallock.h
7708 @@ -0,0 +1,287 @@
7709 +#ifndef _LINUX_LOCALLOCK_H
7710 +#define _LINUX_LOCALLOCK_H
7711 +
7712 +#include <linux/percpu.h>
7713 +#include <linux/spinlock.h>
7714 +
7715 +#ifdef CONFIG_PREEMPT_RT_BASE
7716 +
7717 +#ifdef CONFIG_DEBUG_SPINLOCK
7718 +# define LL_WARN(cond) WARN_ON(cond)
7719 +#else
7720 +# define LL_WARN(cond) do { } while (0)
7721 +#endif
7722 +
7723 +/*
7724 + * per cpu lock based substitute for local_irq_*()
7725 + */
7726 +struct local_irq_lock {
7727 +       spinlock_t              lock;
7728 +       struct task_struct      *owner;
7729 +       int                     nestcnt;
7730 +       unsigned long           flags;
7731 +};
7732 +
7733 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7734 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7735 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7736 +
7737 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7738 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7739 +
7740 +#define local_irq_lock_init(lvar)                                      \
7741 +       do {                                                            \
7742 +               int __cpu;                                              \
7743 +               for_each_possible_cpu(__cpu)                            \
7744 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7745 +       } while (0)
7746 +
7747 +/*
7748 + * spin_lock|trylock|unlock_local flavours that do not disable migration;
7749 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
7750 + * already take care of migrate_disable/enable. Without
7751 + * CONFIG_PREEMPT_RT_FULL they map to the normal spin_* calls.
7752 + */
7753 +#ifdef CONFIG_PREEMPT_RT_FULL
7754 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7755 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7756 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7757 +#else
7758 +# define spin_lock_local(lock)                 spin_lock(lock)
7759 +# define spin_trylock_local(lock)              spin_trylock(lock)
7760 +# define spin_unlock_local(lock)               spin_unlock(lock)
7761 +#endif
7762 +
7763 +static inline void __local_lock(struct local_irq_lock *lv)
7764 +{
7765 +       if (lv->owner != current) {
7766 +               spin_lock_local(&lv->lock);
7767 +               LL_WARN(lv->owner);
7768 +               LL_WARN(lv->nestcnt);
7769 +               lv->owner = current;
7770 +       }
7771 +       lv->nestcnt++;
7772 +}
7773 +
7774 +#define local_lock(lvar)                                       \
7775 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7776 +
7777 +#define local_lock_on(lvar, cpu)                               \
7778 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7779 +
7780 +static inline int __local_trylock(struct local_irq_lock *lv)
7781 +{
7782 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7783 +               LL_WARN(lv->owner);
7784 +               LL_WARN(lv->nestcnt);
7785 +               lv->owner = current;
7786 +               lv->nestcnt = 1;
7787 +               return 1;
7788 +       } else if (lv->owner == current) {
7789 +               lv->nestcnt++;
7790 +               return 1;
7791 +       }
7792 +       return 0;
7793 +}
7794 +
7795 +#define local_trylock(lvar)                                            \
7796 +       ({                                                              \
7797 +               int __locked;                                           \
7798 +               __locked = __local_trylock(&get_local_var(lvar));       \
7799 +               if (!__locked)                                          \
7800 +                       put_local_var(lvar);                            \
7801 +               __locked;                                               \
7802 +       })
7803 +
7804 +static inline void __local_unlock(struct local_irq_lock *lv)
7805 +{
7806 +       LL_WARN(lv->nestcnt == 0);
7807 +       LL_WARN(lv->owner != current);
7808 +       if (--lv->nestcnt)
7809 +               return;
7810 +
7811 +       lv->owner = NULL;
7812 +       spin_unlock_local(&lv->lock);
7813 +}
7814 +
7815 +#define local_unlock(lvar)                                     \
7816 +       do {                                                    \
7817 +               __local_unlock(this_cpu_ptr(&lvar));            \
7818 +               put_local_var(lvar);                            \
7819 +       } while (0)
7820 +
7821 +#define local_unlock_on(lvar, cpu)                       \
7822 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7823 +
7824 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7825 +{
7826 +       spin_lock_irqsave(&lv->lock, lv->flags);
7827 +       LL_WARN(lv->owner);
7828 +       LL_WARN(lv->nestcnt);
7829 +       lv->owner = current;
7830 +       lv->nestcnt = 1;
7831 +}
7832 +
7833 +#define local_lock_irq(lvar)                                           \
7834 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7835 +
7836 +#define local_lock_irq_on(lvar, cpu)                                   \
7837 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7838 +
7839 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7840 +{
7841 +       LL_WARN(!lv->nestcnt);
7842 +       LL_WARN(lv->owner != current);
7843 +       lv->owner = NULL;
7844 +       lv->nestcnt = 0;
7845 +       spin_unlock_irq(&lv->lock);
7846 +}
7847 +
7848 +#define local_unlock_irq(lvar)                                         \
7849 +       do {                                                            \
7850 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7851 +               put_local_var(lvar);                                    \
7852 +       } while (0)
7853 +
7854 +#define local_unlock_irq_on(lvar, cpu)                                 \
7855 +       do {                                                            \
7856 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7857 +       } while (0)
7858 +
7859 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7860 +{
7861 +       if (lv->owner != current) {
7862 +               __local_lock_irq(lv);
7863 +               return 0;
7864 +       } else {
7865 +               lv->nestcnt++;
7866 +               return 1;
7867 +       }
7868 +}
7869 +
7870 +#define local_lock_irqsave(lvar, _flags)                               \
7871 +       do {                                                            \
7872 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7873 +                       put_local_var(lvar);                            \
7874 +               _flags = __this_cpu_read(lvar.flags);                   \
7875 +       } while (0)
7876 +
7877 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7878 +       do {                                                            \
7879 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7880 +               _flags = per_cpu(lvar, cpu).flags;                      \
7881 +       } while (0)
7882 +
7883 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7884 +                                           unsigned long flags)
7885 +{
7886 +       LL_WARN(!lv->nestcnt);
7887 +       LL_WARN(lv->owner != current);
7888 +       if (--lv->nestcnt)
7889 +               return 0;
7890 +
7891 +       lv->owner = NULL;
7892 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7893 +       return 1;
7894 +}
7895 +
7896 +#define local_unlock_irqrestore(lvar, flags)                           \
7897 +       do {                                                            \
7898 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7899 +                       put_local_var(lvar);                            \
7900 +       } while (0)
7901 +
7902 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7903 +       do {                                                            \
7904 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7905 +       } while (0)
7906 +
7907 +#define local_spin_trylock_irq(lvar, lock)                             \
7908 +       ({                                                              \
7909 +               int __locked;                                           \
7910 +               local_lock_irq(lvar);                                   \
7911 +               __locked = spin_trylock(lock);                          \
7912 +               if (!__locked)                                          \
7913 +                       local_unlock_irq(lvar);                         \
7914 +               __locked;                                               \
7915 +       })
7916 +
7917 +#define local_spin_lock_irq(lvar, lock)                                        \
7918 +       do {                                                            \
7919 +               local_lock_irq(lvar);                                   \
7920 +               spin_lock(lock);                                        \
7921 +       } while (0)
7922 +
7923 +#define local_spin_unlock_irq(lvar, lock)                              \
7924 +       do {                                                            \
7925 +               spin_unlock(lock);                                      \
7926 +               local_unlock_irq(lvar);                                 \
7927 +       } while (0)
7928 +
7929 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7930 +       do {                                                            \
7931 +               local_lock_irqsave(lvar, flags);                        \
7932 +               spin_lock(lock);                                        \
7933 +       } while (0)
7934 +
7935 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7936 +       do {                                                            \
7937 +               spin_unlock(lock);                                      \
7938 +               local_unlock_irqrestore(lvar, flags);                   \
7939 +       } while (0)
7940 +
7941 +#define get_locked_var(lvar, var)                                      \
7942 +       (*({                                                            \
7943 +               local_lock(lvar);                                       \
7944 +               this_cpu_ptr(&var);                                     \
7945 +       }))
7946 +
7947 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7948 +
7949 +#define local_lock_cpu(lvar)                                           \
7950 +       ({                                                              \
7951 +               local_lock(lvar);                                       \
7952 +               smp_processor_id();                                     \
7953 +       })
7954 +
7955 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7956 +
7957 +#else /* PREEMPT_RT_BASE */
7958 +
7959 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7960 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7961 +
7962 +static inline void local_irq_lock_init(int lvar) { }
7963 +
7964 +#define local_trylock(lvar)                                    \
7965 +       ({                                                      \
7966 +               preempt_disable();                              \
7967 +               1;                                              \
7968 +       })
7969 +
7970 +#define local_lock(lvar)                       preempt_disable()
7971 +#define local_unlock(lvar)                     preempt_enable()
7972 +#define local_lock_irq(lvar)                   local_irq_disable()
7973 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7974 +#define local_unlock_irq(lvar)                 local_irq_enable()
7975 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7976 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7977 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7978 +
7979 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7980 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7981 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7982 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7983 +       spin_lock_irqsave(lock, flags)
7984 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7985 +       spin_unlock_irqrestore(lock, flags)
7986 +
7987 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7988 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7989 +
7990 +#define local_lock_cpu(lvar)                   get_cpu()
7991 +#define local_unlock_cpu(lvar)                 put_cpu()
7992 +
7993 +#endif
7994 +
7995 +#endif
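
locallock.h is the workhorse of the series: a named, per-CPU lock that replaces bare local_irq_save()/local_irq_disable() protection, so the protected section stays preemptible on RT while still compiling down to the plain irq/preempt primitives on !RT (see the #else block above). A usage sketch with a hypothetical per-CPU list (its list heads are assumed to be initialised at boot):

#include <linux/locallock.h>
#include <linux/list.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct list_head, demo_list);
static DEFINE_LOCAL_IRQ_LOCK(demo_lock);

static void demo_add(struct list_head *item)
{
	unsigned long flags;

	/* On RT this takes the per-CPU demo_lock; on !RT it is local_irq_save(). */
	local_lock_irqsave(demo_lock, flags);
	list_add(item, this_cpu_ptr(&demo_list));
	local_unlock_irqrestore(demo_lock, flags);
}
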
7996 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
7997 index e8471c2ca83a..08bde1a7a987 100644
7998 --- a/include/linux/mm_types.h
7999 +++ b/include/linux/mm_types.h
8000 @@ -11,6 +11,7 @@
8001  #include <linux/completion.h>
8002  #include <linux/cpumask.h>
8003  #include <linux/uprobes.h>
8004 +#include <linux/rcupdate.h>
8005  #include <linux/page-flags-layout.h>
8006  #include <linux/workqueue.h>
8007  #include <asm/page.h>
8008 @@ -513,6 +514,9 @@ struct mm_struct {
8009         bool tlb_flush_batched;
8010  #endif
8011         struct uprobes_state uprobes_state;
8012 +#ifdef CONFIG_PREEMPT_RT_BASE
8013 +       struct rcu_head delayed_drop;
8014 +#endif
8015  #ifdef CONFIG_X86_INTEL_MPX
8016         /* address of the bounds directory */
8017         void __user *bd_addr;
8018 diff --git a/include/linux/module.h b/include/linux/module.h
8019 index 0c3207d26ac0..5944baaa3f28 100644
8020 --- a/include/linux/module.h
8021 +++ b/include/linux/module.h
8022 @@ -496,6 +496,7 @@ static inline int module_is_live(struct module *mod)
8023  struct module *__module_text_address(unsigned long addr);
8024  struct module *__module_address(unsigned long addr);
8025  bool is_module_address(unsigned long addr);
8026 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
8027  bool is_module_percpu_address(unsigned long addr);
8028  bool is_module_text_address(unsigned long addr);
8029  
8030 @@ -663,6 +664,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
8031         return false;
8032  }
8033  
8034 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
8035 +{
8036 +       return false;
8037 +}
8038 +
8039  static inline bool is_module_text_address(unsigned long addr)
8040  {
8041         return false;
8042 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
8043 index 2cb7531e7d7a..b3fdfc820216 100644
8044 --- a/include/linux/mutex.h
8045 +++ b/include/linux/mutex.h
8046 @@ -19,6 +19,17 @@
8047  #include <asm/processor.h>
8048  #include <linux/osq_lock.h>
8049  
8050 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8051 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
8052 +       , .dep_map = { .name = #lockname }
8053 +#else
8054 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
8055 +#endif
8056 +
8057 +#ifdef CONFIG_PREEMPT_RT_FULL
8058 +# include <linux/mutex_rt.h>
8059 +#else
8060 +
8061  /*
8062   * Simple, straightforward mutexes with strict semantics:
8063   *
8064 @@ -99,13 +110,6 @@ do {                                                        \
8065  static inline void mutex_destroy(struct mutex *lock) {}
8066  #endif
8067  
8068 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
8069 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
8070 -               , .dep_map = { .name = #lockname }
8071 -#else
8072 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
8073 -#endif
8074 -
8075  #define __MUTEX_INITIALIZER(lockname) \
8076                 { .count = ATOMIC_INIT(1) \
8077                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
8078 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
8079  extern int mutex_trylock(struct mutex *lock);
8080  extern void mutex_unlock(struct mutex *lock);
8081  
8082 +#endif /* !PREEMPT_RT_FULL */
8083 +
8084  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
8085  
8086  #endif /* __LINUX_MUTEX_H */
8087 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
8088 new file mode 100644
8089 index 000000000000..e0284edec655
8090 --- /dev/null
8091 +++ b/include/linux/mutex_rt.h
8092 @@ -0,0 +1,89 @@
8093 +#ifndef __LINUX_MUTEX_RT_H
8094 +#define __LINUX_MUTEX_RT_H
8095 +
8096 +#ifndef __LINUX_MUTEX_H
8097 +#error "Please include mutex.h"
8098 +#endif
8099 +
8100 +#include <linux/rtmutex.h>
8101 +
8102 +/* FIXME: Just for __lockfunc */
8103 +#include <linux/spinlock.h>
8104 +
8105 +struct mutex {
8106 +       struct rt_mutex         lock;
8107 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8108 +       struct lockdep_map      dep_map;
8109 +#endif
8110 +};
8111 +
8112 +#define __MUTEX_INITIALIZER(mutexname)                                 \
8113 +       {                                                               \
8114 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
8115 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
8116 +       }
8117 +
8118 +#define DEFINE_MUTEX(mutexname)                                                \
8119 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
8120 +
8121 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
8122 +extern void __lockfunc _mutex_lock(struct mutex *lock);
8123 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
8124 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
8125 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
8126 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
8127 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
8128 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
8129 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
8130 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
8131 +
8132 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
8133 +#define mutex_lock(l)                  _mutex_lock(l)
8134 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
8135 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
8136 +#define mutex_trylock(l)               _mutex_trylock(l)
8137 +#define mutex_unlock(l)                        _mutex_unlock(l)
8138 +
8139 +#ifdef CONFIG_DEBUG_MUTEXES
8140 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
8141 +#else
8142 +static inline void mutex_destroy(struct mutex *lock) {}
8143 +#endif
8144 +
8145 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8146 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
8147 +# define mutex_lock_interruptible_nested(l, s) \
8148 +                                       _mutex_lock_interruptible_nested(l, s)
8149 +# define mutex_lock_killable_nested(l, s) \
8150 +                                       _mutex_lock_killable_nested(l, s)
8151 +
8152 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
8153 +do {                                                                   \
8154 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
8155 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
8156 +} while (0)
8157 +
8158 +#else
8159 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
8160 +# define mutex_lock_interruptible_nested(l, s) \
8161 +                                       _mutex_lock_interruptible(l)
8162 +# define mutex_lock_killable_nested(l, s) \
8163 +                                       _mutex_lock_killable(l)
8164 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
8165 +#endif
8166 +
8167 +# define mutex_init(mutex)                             \
8168 +do {                                                   \
8169 +       static struct lock_class_key __key;             \
8170 +                                                       \
8171 +       rt_mutex_init(&(mutex)->lock);                  \
8172 +       __mutex_do_init((mutex), #mutex, &__key);       \
8173 +} while (0)
8174 +
8175 +# define __mutex_init(mutex, name, key)                        \
8176 +do {                                                   \
8177 +       rt_mutex_init(&(mutex)->lock);                  \
8178 +       __mutex_do_init((mutex), name, key);            \
8179 +} while (0)
8180 +
8181 +#endif
8182 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
8183 index 47c7f5b8f675..85fc72b8a92b 100644
8184 --- a/include/linux/netdevice.h
8185 +++ b/include/linux/netdevice.h
8186 @@ -396,7 +396,19 @@ typedef enum rx_handler_result rx_handler_result_t;
8187  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
8188  
8189  void __napi_schedule(struct napi_struct *n);
8190 +
8191 +/*
8192 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
8193 + * run as threads and can themselves be preempted (without PREEMPT_RT,
8194 + * interrupt threads cannot be preempted). This means that calling
8195 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
8196 + * and can corrupt the napi->poll_list.
8197 + */
8198 +#ifdef CONFIG_PREEMPT_RT_FULL
8199 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
8200 +#else
8201  void __napi_schedule_irqoff(struct napi_struct *n);
8202 +#endif
8203  
8204  static inline bool napi_disable_pending(struct napi_struct *n)
8205  {
8206 @@ -2464,14 +2476,53 @@ void netdev_freemem(struct net_device *dev);
8207  void synchronize_net(void);
8208  int init_dummy_netdev(struct net_device *dev);
8209  
8210 -DECLARE_PER_CPU(int, xmit_recursion);
8211  #define XMIT_RECURSION_LIMIT   10
8212 +#ifdef CONFIG_PREEMPT_RT_FULL
8213 +static inline int dev_recursion_level(void)
8214 +{
8215 +       return current->xmit_recursion;
8216 +}
8217 +
8218 +static inline int xmit_rec_read(void)
8219 +{
8220 +       return current->xmit_recursion;
8221 +}
8222 +
8223 +static inline void xmit_rec_inc(void)
8224 +{
8225 +       current->xmit_recursion++;
8226 +}
8227 +
8228 +static inline void xmit_rec_dec(void)
8229 +{
8230 +       current->xmit_recursion--;
8231 +}
8232 +
8233 +#else
8234 +
8235 +DECLARE_PER_CPU(int, xmit_recursion);
8236  
8237  static inline int dev_recursion_level(void)
8238  {
8239         return this_cpu_read(xmit_recursion);
8240  }
8241  
8242 +static inline int xmit_rec_read(void)
8243 +{
8244 +       return __this_cpu_read(xmit_recursion);
8245 +}
8246 +
8247 +static inline void xmit_rec_inc(void)
8248 +{
8249 +       __this_cpu_inc(xmit_recursion);
8250 +}
8251 +
8252 +static inline void xmit_rec_dec(void)
8253 +{
8254 +       __this_cpu_dec(xmit_recursion);
8255 +}
8256 +#endif
8257 +
8258  struct net_device *dev_get_by_index(struct net *net, int ifindex);
8259  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
8260  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
8261 @@ -2856,6 +2907,7 @@ struct softnet_data {
8262         unsigned int            dropped;
8263         struct sk_buff_head     input_pkt_queue;
8264         struct napi_struct      backlog;
8265 +       struct sk_buff_head     tofree_queue;
8266  
8267  };
8268  
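The hunks above replace the per-CPU xmit_recursion counter with a per-task field on PREEMPT_RT_FULL, because a preemptible, migratable transmit path can no longer rely on per-CPU state. A minimal sketch of how a transmit path would use the accessors, assuming kernel context; example_xmit_guarded() and xmit_one() are made-up names, not kernel APIs:

#include <linux/netdevice.h>

static int example_xmit_guarded(struct sk_buff *skb)
{
	int ret;

	if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
		return -ENOBUFS;	/* too deep: bail out instead of recursing further */

	xmit_rec_inc();			/* per-task on RT, per-CPU otherwise */
	ret = xmit_one(skb);		/* hypothetical helper; may re-enter the stack */
	xmit_rec_dec();

	return ret;
}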
8269 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
8270 index 2ad1a2b289b5..b4d10155af54 100644
8271 --- a/include/linux/netfilter/x_tables.h
8272 +++ b/include/linux/netfilter/x_tables.h
8273 @@ -4,6 +4,7 @@
8274  
8275  #include <linux/netdevice.h>
8276  #include <linux/static_key.h>
8277 +#include <linux/locallock.h>
8278  #include <uapi/linux/netfilter/x_tables.h>
8279  
8280  /* Test a struct->invflags and a boolean for inequality */
8281 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
8282   */
8283  DECLARE_PER_CPU(seqcount_t, xt_recseq);
8284  
8285 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
8286 +
8287  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
8288   *
8289   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
8290 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
8291  {
8292         unsigned int addend;
8293  
8294 +       /* RT protection */
8295 +       local_lock(xt_write_lock);
8296 +
8297         /*
8298          * Low order bit of sequence is set if we already
8299          * called xt_write_recseq_begin().
8300 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
8301         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
8302         smp_wmb();
8303         __this_cpu_add(xt_recseq.sequence, addend);
8304 +       local_unlock(xt_write_lock);
8305  }
8306  
8307  /*
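With the local lock added above, the usual writer-side pattern from the ip(6)tables traversal code keeps working on RT: the begin/end pair now also serializes writers on the same CPU that could otherwise preempt each other. A rough sketch of that pattern, assuming kernel context; the function name is made up:

#include <linux/netfilter/x_tables.h>

static void example_xt_writer_section(void)
{
	unsigned int addend;

	local_bh_disable();
	addend = xt_write_recseq_begin();	/* takes xt_write_lock on RT */

	/* ... update per-CPU rule counters / traverse the table ... */

	xt_write_recseq_end(addend);		/* releases xt_write_lock on RT */
	local_bh_enable();
}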
8308 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
8309 index 810124b33327..d54ca43d571f 100644
8310 --- a/include/linux/nfs_fs.h
8311 +++ b/include/linux/nfs_fs.h
8312 @@ -165,7 +165,11 @@ struct nfs_inode {
8313  
8314         /* Readers: in-flight sillydelete RPC calls */
8315         /* Writers: rmdir */
8316 +#ifdef CONFIG_PREEMPT_RT_BASE
8317 +       struct semaphore        rmdir_sem;
8318 +#else
8319         struct rw_semaphore     rmdir_sem;
8320 +#endif
8321  
8322  #if IS_ENABLED(CONFIG_NFS_V4)
8323         struct nfs4_cached_acl  *nfs4_acl;
8324 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
8325 index 3bf867a0c3b3..71c6bdd14c8a 100644
8326 --- a/include/linux/nfs_xdr.h
8327 +++ b/include/linux/nfs_xdr.h
8328 @@ -1490,7 +1490,7 @@ struct nfs_unlinkdata {
8329         struct nfs_removeargs args;
8330         struct nfs_removeres res;
8331         struct dentry *dentry;
8332 -       wait_queue_head_t wq;
8333 +       struct swait_queue_head wq;
8334         struct rpc_cred *cred;
8335         struct nfs_fattr dir_attr;
8336         long timeout;
8337 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
8338 index 4149868de4e6..babe5b9bcb91 100644
8339 --- a/include/linux/notifier.h
8340 +++ b/include/linux/notifier.h
8341 @@ -6,7 +6,7 @@
8342   *
8343   *                             Alan Cox <Alan.Cox@linux.org>
8344   */
8345 - 
8346 +
8347  #ifndef _LINUX_NOTIFIER_H
8348  #define _LINUX_NOTIFIER_H
8349  #include <linux/errno.h>
8350 @@ -42,9 +42,7 @@
8351   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
8352   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
8353   * SRCU notifier chains should be used when the chain will be called very
8354 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
8355 - * chains are slightly more difficult to use because they require special
8356 - * runtime initialization.
8357 + * often but notifier_blocks will seldom be removed.
8358   */
8359  
8360  struct notifier_block;
8361 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
8362                 (name)->head = NULL;            \
8363         } while (0)
8364  
8365 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
8366 +/* srcu_notifier_heads must be cleaned up dynamically */
8367  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8368  #define srcu_cleanup_notifier_head(name)       \
8369                 cleanup_srcu_struct(&(name)->srcu);
8370 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8371                 .head = NULL }
8372  #define RAW_NOTIFIER_INIT(name)        {                               \
8373                 .head = NULL }
8374 -/* srcu_notifier_heads cannot be initialized statically */
8375 +
8376 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
8377 +       {                                                       \
8378 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
8379 +               .head = NULL,                                   \
8380 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
8381 +       }
8382  
8383  #define ATOMIC_NOTIFIER_HEAD(name)                             \
8384         struct atomic_notifier_head name =                      \
8385 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8386         struct raw_notifier_head name =                         \
8387                 RAW_NOTIFIER_INIT(name)
8388  
8389 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
8390 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
8391 +                       name##_head_srcu_array);                \
8392 +       mod struct srcu_notifier_head name =                    \
8393 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
8394 +
8395 +#define SRCU_NOTIFIER_HEAD(name)                               \
8396 +       _SRCU_NOTIFIER_HEAD(name, )
8397 +
8398 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
8399 +       _SRCU_NOTIFIER_HEAD(name, static)
8400 +
8401  #ifdef __KERNEL__
8402  
8403  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
8404 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
8405  
8406  /*
8407   *     Declared notifiers so far. I can imagine quite a few more chains
8408 - *     over time (eg laptop power reset chains, reboot chain (to clean 
8409 + *     over time (eg laptop power reset chains, reboot chain (to clean
8410   *     device units up), device [un]mount chain, module load/unload chain,
8411 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
8412 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
8413   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
8414   */
8415 - 
8416 +
8417  /* CPU notifiers are defined in include/linux/cpu.h. */
8418  
8419  /* netdevice notifiers are defined in include/linux/netdevice.h */
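The SRCU_NOTIFIER_INIT()/SRCU_NOTIFIER_HEAD_STATIC() macros added above let an SRCU notifier head be defined statically, which is why the "requires special runtime initialization" wording is dropped. A sketch of the intended usage; the chain and callback names are made up:

#include <linux/notifier.h>
#include <linux/init.h>

SRCU_NOTIFIER_HEAD_STATIC(example_chain);

static int example_callback(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block example_nb = {
	.notifier_call = example_callback,
};

static int __init example_init(void)
{
	/* no srcu_init_notifier_head() needed for statically defined heads */
	return srcu_notifier_chain_register(&example_chain, &example_nb);
}
core_initcall(example_init);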
8420 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
8421 index 5b2e6159b744..ea940f451606 100644
8422 --- a/include/linux/percpu-rwsem.h
8423 +++ b/include/linux/percpu-rwsem.h
8424 @@ -4,7 +4,7 @@
8425  #include <linux/atomic.h>
8426  #include <linux/rwsem.h>
8427  #include <linux/percpu.h>
8428 -#include <linux/wait.h>
8429 +#include <linux/swait.h>
8430  #include <linux/rcu_sync.h>
8431  #include <linux/lockdep.h>
8432  
8433 @@ -12,7 +12,7 @@ struct percpu_rw_semaphore {
8434         struct rcu_sync         rss;
8435         unsigned int __percpu   *read_count;
8436         struct rw_semaphore     rw_sem;
8437 -       wait_queue_head_t       writer;
8438 +       struct swait_queue_head writer;
8439         int                     readers_block;
8440  };
8441  
8442 @@ -22,13 +22,13 @@ static struct percpu_rw_semaphore name = {                          \
8443         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
8444         .read_count = &__percpu_rwsem_rc_##name,                        \
8445         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
8446 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
8447 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
8448  }
8449  
8450  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
8451  extern void __percpu_up_read(struct percpu_rw_semaphore *);
8452  
8453 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
8454 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8455  {
8456         might_sleep();
8457  
8458 @@ -46,16 +46,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
8459         __this_cpu_inc(*sem->read_count);
8460         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
8461                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
8462 -       barrier();
8463         /*
8464 -        * The barrier() prevents the compiler from
8465 +        * The preempt_enable() prevents the compiler from
8466          * bleeding the critical section out.
8467          */
8468 -}
8469 -
8470 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8471 -{
8472 -       percpu_down_read_preempt_disable(sem);
8473         preempt_enable();
8474  }
8475  
8476 @@ -82,13 +76,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
8477         return ret;
8478  }
8479  
8480 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
8481 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8482  {
8483 -       /*
8484 -        * The barrier() prevents the compiler from
8485 -        * bleeding the critical section out.
8486 -        */
8487 -       barrier();
8488 +       preempt_disable();
8489         /*
8490          * Same as in percpu_down_read().
8491          */
8492 @@ -101,12 +91,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
8493         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
8494  }
8495  
8496 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8497 -{
8498 -       preempt_disable();
8499 -       percpu_up_read_preempt_enable(sem);
8500 -}
8501 -
8502  extern void percpu_down_write(struct percpu_rw_semaphore *);
8503  extern void percpu_up_write(struct percpu_rw_semaphore *);
8504  
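After the conversion above, the preempt-disabled window is fully contained inside percpu_down_read()/percpu_up_read(), and the writer now waits on an RT-safe swait queue. Reader-side usage is unchanged; a minimal sketch, assuming kernel context and a made-up function name:

#include <linux/percpu-rwsem.h>

static void example_percpu_rwsem_reader(struct percpu_rw_semaphore *sem)
{
	percpu_down_read(sem);
	/* read-side section: may block a writer, never other readers */
	percpu_up_read(sem);
}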
8505 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
8506 index 56939d3f6e53..b988bf40ad3e 100644
8507 --- a/include/linux/percpu.h
8508 +++ b/include/linux/percpu.h
8509 @@ -18,6 +18,35 @@
8510  #define PERCPU_MODULE_RESERVE          0
8511  #endif
8512  
8513 +#ifdef CONFIG_PREEMPT_RT_FULL
8514 +
8515 +#define get_local_var(var) (*({        \
8516 +       migrate_disable();      \
8517 +       this_cpu_ptr(&var);     }))
8518 +
8519 +#define put_local_var(var) do {        \
8520 +       (void)&(var);           \
8521 +       migrate_enable();       \
8522 +} while (0)
8523 +
8524 +# define get_local_ptr(var) ({ \
8525 +       migrate_disable();      \
8526 +       this_cpu_ptr(var);      })
8527 +
8528 +# define put_local_ptr(var) do {       \
8529 +       (void)(var);                    \
8530 +       migrate_enable();               \
8531 +} while (0)
8532 +
8533 +#else
8534 +
8535 +#define get_local_var(var)     get_cpu_var(var)
8536 +#define put_local_var(var)     put_cpu_var(var)
8537 +#define get_local_ptr(var)     get_cpu_ptr(var)
8538 +#define put_local_ptr(var)     put_cpu_ptr(var)
8539 +
8540 +#endif
8541 +
8542  /* minimum unit size, also is the maximum supported allocation size */
8543  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
8544  
8545 @@ -110,6 +139,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
8546  #endif
8547  
8548  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
8549 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
8550  extern bool is_kernel_percpu_address(unsigned long addr);
8551  
8552  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
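get_local_var()/get_local_ptr() added above give RT code a way to pin a task to its CPU via migrate_disable() while staying preemptible; without RT they fall back to get_cpu_var()/get_cpu_ptr(). A sketch with made-up names, under the assumption that the per-CPU data is additionally serialized (e.g. by a local lock) or only touched from one context, since migrate_disable() alone does not exclude other tasks on the same CPU:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_count);

static void example_count_event(void)
{
	int *cnt = &get_local_var(example_count);	/* migrate_disable() on RT */

	(*cnt)++;	/* caller must still serialize against same-CPU tasks on RT */

	put_local_var(example_count);			/* migrate_enable() on RT */
}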
8553 diff --git a/include/linux/pid.h b/include/linux/pid.h
8554 index 97b745ddece5..01a5460a0c85 100644
8555 --- a/include/linux/pid.h
8556 +++ b/include/linux/pid.h
8557 @@ -2,6 +2,7 @@
8558  #define _LINUX_PID_H
8559  
8560  #include <linux/rcupdate.h>
8561 +#include <linux/atomic.h>
8562  
8563  enum pid_type
8564  {
8565 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
8566 index 7eeceac52dea..f97c54265904 100644
8567 --- a/include/linux/preempt.h
8568 +++ b/include/linux/preempt.h
8569 @@ -50,7 +50,11 @@
8570  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8571  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8572  
8573 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8574 +#ifndef CONFIG_PREEMPT_RT_FULL
8575 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8576 +#else
8577 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8578 +#endif
8579  
8580  /* We use the MSB mostly because its available */
8581  #define PREEMPT_NEED_RESCHED   0x80000000
8582 @@ -59,9 +63,15 @@
8583  #include <asm/preempt.h>
8584  
8585  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8586 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8587  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8588                                  | NMI_MASK))
8589 +#ifndef CONFIG_PREEMPT_RT_FULL
8590 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8591 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8592 +#else
8593 +# define softirq_count()       (0UL)
8594 +extern int in_serving_softirq(void);
8595 +#endif
8596  
8597  /*
8598   * Are we doing bottom half or hardware interrupt processing?
8599 @@ -79,7 +89,6 @@
8600  #define in_irq()               (hardirq_count())
8601  #define in_softirq()           (softirq_count())
8602  #define in_interrupt()         (irq_count())
8603 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8604  #define in_nmi()               (preempt_count() & NMI_MASK)
8605  #define in_task()              (!(preempt_count() & \
8606                                    (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
8607 @@ -96,7 +105,11 @@
8608  /*
8609   * The preempt_count offset after spin_lock()
8610   */
8611 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8612  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8613 +#else
8614 +#define PREEMPT_LOCK_OFFSET    0
8615 +#endif
8616  
8617  /*
8618   * The preempt_count offset needed for things like:
8619 @@ -145,6 +158,20 @@ extern void preempt_count_sub(int val);
8620  #define preempt_count_inc() preempt_count_add(1)
8621  #define preempt_count_dec() preempt_count_sub(1)
8622  
8623 +#ifdef CONFIG_PREEMPT_LAZY
8624 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8625 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8626 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8627 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8628 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8629 +#else
8630 +#define add_preempt_lazy_count(val)    do { } while (0)
8631 +#define sub_preempt_lazy_count(val)    do { } while (0)
8632 +#define inc_preempt_lazy_count()       do { } while (0)
8633 +#define dec_preempt_lazy_count()       do { } while (0)
8634 +#define preempt_lazy_count()           (0)
8635 +#endif
8636 +
8637  #ifdef CONFIG_PREEMPT_COUNT
8638  
8639  #define preempt_disable() \
8640 @@ -153,13 +180,25 @@ do { \
8641         barrier(); \
8642  } while (0)
8643  
8644 +#define preempt_lazy_disable() \
8645 +do { \
8646 +       inc_preempt_lazy_count(); \
8647 +       barrier(); \
8648 +} while (0)
8649 +
8650  #define sched_preempt_enable_no_resched() \
8651  do { \
8652         barrier(); \
8653         preempt_count_dec(); \
8654  } while (0)
8655  
8656 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8657 +#ifdef CONFIG_PREEMPT_RT_BASE
8658 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8659 +# define preempt_check_resched_rt() preempt_check_resched()
8660 +#else
8661 +# define preempt_enable_no_resched() preempt_enable()
8662 +# define preempt_check_resched_rt() barrier();
8663 +#endif
8664  
8665  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8666  
8667 @@ -184,6 +223,13 @@ do { \
8668                 __preempt_schedule(); \
8669  } while (0)
8670  
8671 +#define preempt_lazy_enable() \
8672 +do { \
8673 +       dec_preempt_lazy_count(); \
8674 +       barrier(); \
8675 +       preempt_check_resched(); \
8676 +} while (0)
8677 +
8678  #else /* !CONFIG_PREEMPT */
8679  #define preempt_enable() \
8680  do { \
8681 @@ -229,6 +275,7 @@ do { \
8682  #define preempt_disable_notrace()              barrier()
8683  #define preempt_enable_no_resched_notrace()    barrier()
8684  #define preempt_enable_notrace()               barrier()
8685 +#define preempt_check_resched_rt()             barrier()
8686  #define preemptible()                          0
8687  
8688  #endif /* CONFIG_PREEMPT_COUNT */
8689 @@ -249,10 +296,31 @@ do { \
8690  } while (0)
8691  #define preempt_fold_need_resched() \
8692  do { \
8693 -       if (tif_need_resched()) \
8694 +       if (tif_need_resched_now()) \
8695                 set_preempt_need_resched(); \
8696  } while (0)
8697  
8698 +#ifdef CONFIG_PREEMPT_RT_FULL
8699 +# define preempt_disable_rt()          preempt_disable()
8700 +# define preempt_enable_rt()           preempt_enable()
8701 +# define preempt_disable_nort()                barrier()
8702 +# define preempt_enable_nort()         barrier()
8703 +# ifdef CONFIG_SMP
8704 +   extern void migrate_disable(void);
8705 +   extern void migrate_enable(void);
8706 +# else /* CONFIG_SMP */
8707 +#  define migrate_disable()            barrier()
8708 +#  define migrate_enable()             barrier()
8709 +# endif /* CONFIG_SMP */
8710 +#else
8711 +# define preempt_disable_rt()          barrier()
8712 +# define preempt_enable_rt()           barrier()
8713 +# define preempt_disable_nort()                preempt_disable()
8714 +# define preempt_enable_nort()         preempt_enable()
8715 +# define migrate_disable()             preempt_disable()
8716 +# define migrate_enable()              preempt_enable()
8717 +#endif
8718 +
8719  #ifdef CONFIG_PREEMPT_NOTIFIERS
8720  
8721  struct preempt_notifier;
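The _nort/_rt helpers and migrate_disable()/migrate_enable() definitions above let callers pick the cheapest primitive per configuration: a section that only needs to stay on one CPU can remain preemptible on RT. A minimal sketch, assuming kernel context and a made-up function name:

#include <linux/preempt.h>

static void example_stay_on_cpu(void)
{
	migrate_disable();	/* preempt_disable() without RT; true migrate_disable() on RT+SMP */

	/* smp_processor_id() is stable here; on RT, sleeping locks are still allowed */

	migrate_enable();
}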
8722 diff --git a/include/linux/printk.h b/include/linux/printk.h
8723 index eac1af8502bb..37e647af0b0b 100644
8724 --- a/include/linux/printk.h
8725 +++ b/include/linux/printk.h
8726 @@ -126,9 +126,11 @@ struct va_format {
8727  #ifdef CONFIG_EARLY_PRINTK
8728  extern asmlinkage __printf(1, 2)
8729  void early_printk(const char *fmt, ...);
8730 +extern void printk_kill(void);
8731  #else
8732  static inline __printf(1, 2) __cold
8733  void early_printk(const char *s, ...) { }
8734 +static inline void printk_kill(void) { }
8735  #endif
8736  
8737  #ifdef CONFIG_PRINTK_NMI
8738 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
8739 index af3581b8a451..277295039c8f 100644
8740 --- a/include/linux/radix-tree.h
8741 +++ b/include/linux/radix-tree.h
8742 @@ -292,6 +292,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
8743  int radix_tree_preload(gfp_t gfp_mask);
8744  int radix_tree_maybe_preload(gfp_t gfp_mask);
8745  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8746 +void radix_tree_preload_end(void);
8747 +
8748  void radix_tree_init(void);
8749  void *radix_tree_tag_set(struct radix_tree_root *root,
8750                         unsigned long index, unsigned int tag);
8751 @@ -314,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
8752  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
8753  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8754  
8755 -static inline void radix_tree_preload_end(void)
8756 -{
8757 -       preempt_enable();
8758 -}
8759 -
8760  /**
8761   * struct radix_tree_iter - radix tree iterator state
8762   *
8763 diff --git a/include/linux/random.h b/include/linux/random.h
8764 index 16ab429735a7..9d0fecb5b6c2 100644
8765 --- a/include/linux/random.h
8766 +++ b/include/linux/random.h
8767 @@ -31,7 +31,7 @@ static inline void add_latent_entropy(void) {}
8768  
8769  extern void add_input_randomness(unsigned int type, unsigned int code,
8770                                  unsigned int value) __latent_entropy;
8771 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8772 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8773  
8774  extern void get_random_bytes(void *buf, int nbytes);
8775  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8776 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
8777 index e585018498d5..25c64474fc27 100644
8778 --- a/include/linux/rbtree.h
8779 +++ b/include/linux/rbtree.h
8780 @@ -31,7 +31,7 @@
8781  
8782  #include <linux/kernel.h>
8783  #include <linux/stddef.h>
8784 -#include <linux/rcupdate.h>
8785 +#include <linux/rcu_assign_pointer.h>
8786  
8787  struct rb_node {
8788         unsigned long  __rb_parent_color;
8789 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
8790 index d076183e49be..36bfb4dd57ae 100644
8791 --- a/include/linux/rbtree_augmented.h
8792 +++ b/include/linux/rbtree_augmented.h
8793 @@ -26,6 +26,7 @@
8794  
8795  #include <linux/compiler.h>
8796  #include <linux/rbtree.h>
8797 +#include <linux/rcupdate.h>
8798  
8799  /*
8800   * Please note - only struct rb_augment_callbacks and the prototypes for
8801 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
8802 new file mode 100644
8803 index 000000000000..7066962a4379
8804 --- /dev/null
8805 +++ b/include/linux/rcu_assign_pointer.h
8806 @@ -0,0 +1,54 @@
8807 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8808 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8809 +#include <linux/compiler.h>
8810 +#include <asm/barrier.h>
8811 +
8812 +/**
8813 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8814 + * @v: The value to statically initialize with.
8815 + */
8816 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8817 +
8818 +/**
8819 + * rcu_assign_pointer() - assign to RCU-protected pointer
8820 + * @p: pointer to assign to
8821 + * @v: value to assign (publish)
8822 + *
8823 + * Assigns the specified value to the specified RCU-protected
8824 + * pointer, ensuring that any concurrent RCU readers will see
8825 + * any prior initialization.
8826 + *
8827 + * Inserts memory barriers on architectures that require them
8828 + * (which is most of them), and also prevents the compiler from
8829 + * reordering the code that initializes the structure after the pointer
8830 + * assignment.  More importantly, this call documents which pointers
8831 + * will be dereferenced by RCU read-side code.
8832 + *
8833 + * In some special cases, you may use RCU_INIT_POINTER() instead
8834 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8835 + * to the fact that it does not constrain either the CPU or the compiler.
8836 + * That said, using RCU_INIT_POINTER() when you should have used
8837 + * rcu_assign_pointer() is a very bad thing that results in
8838 + * impossible-to-diagnose memory corruption.  So please be careful.
8839 + * See the RCU_INIT_POINTER() comment header for details.
8840 + *
8841 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8842 + * once, appearances notwithstanding.  One of the "extra" evaluations
8843 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8844 + * neither of which actually execute the argument.  As with most cpp
8845 + * macros, this execute-arguments-only-once property is important, so
8846 + * please be careful when making changes to rcu_assign_pointer() and the
8847 + * other macros that it invokes.
8848 + */
8849 +#define rcu_assign_pointer(p, v)                                             \
8850 +({                                                                           \
8851 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8852 +                                                                             \
8853 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8854 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8855 +       else                                                                  \
8856 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8857 +       _r_a_p__v;                                                            \
8858 +})
8859 +
8860 +#endif
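The new header above only moves rcu_assign_pointer() (and RCU_INITIALIZER()) out of rcupdate.h so that rbtree.h can use it without pulling in the full RCU header; the semantics are unchanged. The usual publish/read pairing, as a sketch with made-up names:

#include <linux/rcupdate.h>

struct example_cfg {
	int threshold;
};

static struct example_cfg __rcu *example_cur;

static void example_publish(struct example_cfg *newc)
{
	rcu_assign_pointer(example_cur, newc);	/* readers see a fully initialized newc */
}

static int example_read(void)
{
	struct example_cfg *c;
	int val = 0;

	rcu_read_lock();
	c = rcu_dereference(example_cur);
	if (c)
		val = c->threshold;
	rcu_read_unlock();

	return val;
}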
8861 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
8862 index 01f71e1d2e94..30cc001d0d5a 100644
8863 --- a/include/linux/rcupdate.h
8864 +++ b/include/linux/rcupdate.h
8865 @@ -46,6 +46,7 @@
8866  #include <linux/compiler.h>
8867  #include <linux/ktime.h>
8868  #include <linux/irqflags.h>
8869 +#include <linux/rcu_assign_pointer.h>
8870  
8871  #include <asm/barrier.h>
8872  
8873 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
8874  
8875  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8876  
8877 +#ifdef CONFIG_PREEMPT_RT_FULL
8878 +#define call_rcu_bh    call_rcu
8879 +#else
8880  /**
8881   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8882   * @head: structure to be used for queueing the RCU updates.
8883 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
8884   */
8885  void call_rcu_bh(struct rcu_head *head,
8886                  rcu_callback_t func);
8887 +#endif
8888  
8889  /**
8890   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8891 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
8892   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8893   */
8894  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8895 +#ifndef CONFIG_PREEMPT_RT_FULL
8896 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8897 +#else
8898 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8899 +#endif
8900  
8901  #else /* #ifdef CONFIG_PREEMPT_RCU */
8902  
8903 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
8904         return 0;
8905  }
8906  
8907 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8908 +
8909  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8910  
8911  /* Internal to kernel */
8912 @@ -505,7 +517,14 @@ extern struct lockdep_map rcu_callback_map;
8913  int debug_lockdep_rcu_enabled(void);
8914  
8915  int rcu_read_lock_held(void);
8916 +#ifdef CONFIG_PREEMPT_RT_FULL
8917 +static inline int rcu_read_lock_bh_held(void)
8918 +{
8919 +       return rcu_read_lock_held();
8920 +}
8921 +#else
8922  int rcu_read_lock_bh_held(void);
8923 +#endif
8924  
8925  /**
8926   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8927 @@ -626,54 +645,6 @@ static inline void rcu_preempt_sleep_check(void)
8928  })
8929  
8930  /**
8931 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8932 - * @v: The value to statically initialize with.
8933 - */
8934 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8935 -
8936 -/**
8937 - * rcu_assign_pointer() - assign to RCU-protected pointer
8938 - * @p: pointer to assign to
8939 - * @v: value to assign (publish)
8940 - *
8941 - * Assigns the specified value to the specified RCU-protected
8942 - * pointer, ensuring that any concurrent RCU readers will see
8943 - * any prior initialization.
8944 - *
8945 - * Inserts memory barriers on architectures that require them
8946 - * (which is most of them), and also prevents the compiler from
8947 - * reordering the code that initializes the structure after the pointer
8948 - * assignment.  More importantly, this call documents which pointers
8949 - * will be dereferenced by RCU read-side code.
8950 - *
8951 - * In some special cases, you may use RCU_INIT_POINTER() instead
8952 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8953 - * to the fact that it does not constrain either the CPU or the compiler.
8954 - * That said, using RCU_INIT_POINTER() when you should have used
8955 - * rcu_assign_pointer() is a very bad thing that results in
8956 - * impossible-to-diagnose memory corruption.  So please be careful.
8957 - * See the RCU_INIT_POINTER() comment header for details.
8958 - *
8959 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8960 - * once, appearances notwithstanding.  One of the "extra" evaluations
8961 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8962 - * neither of which actually execute the argument.  As with most cpp
8963 - * macros, this execute-arguments-only-once property is important, so
8964 - * please be careful when making changes to rcu_assign_pointer() and the
8965 - * other macros that it invokes.
8966 - */
8967 -#define rcu_assign_pointer(p, v)                                             \
8968 -({                                                                           \
8969 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8970 -                                                                             \
8971 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8972 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8973 -       else                                                                  \
8974 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8975 -       _r_a_p__v;                                                            \
8976 -})
8977 -
8978 -/**
8979   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8980   * @p: The pointer to read
8981   *
8982 @@ -951,10 +922,14 @@ static inline void rcu_read_unlock(void)
8983  static inline void rcu_read_lock_bh(void)
8984  {
8985         local_bh_disable();
8986 +#ifdef CONFIG_PREEMPT_RT_FULL
8987 +       rcu_read_lock();
8988 +#else
8989         __acquire(RCU_BH);
8990         rcu_lock_acquire(&rcu_bh_lock_map);
8991         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8992                          "rcu_read_lock_bh() used illegally while idle");
8993 +#endif
8994  }
8995  
8996  /*
8997 @@ -964,10 +939,14 @@ static inline void rcu_read_lock_bh(void)
8998   */
8999  static inline void rcu_read_unlock_bh(void)
9000  {
9001 +#ifdef CONFIG_PREEMPT_RT_FULL
9002 +       rcu_read_unlock();
9003 +#else
9004         RCU_LOCKDEP_WARN(!rcu_is_watching(),
9005                          "rcu_read_unlock_bh() used illegally while idle");
9006         rcu_lock_release(&rcu_bh_lock_map);
9007         __release(RCU_BH);
9008 +#endif
9009         local_bh_enable();
9010  }
9011  
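With the mappings above, BH-flavoured RCU collapses onto plain RCU on PREEMPT_RT_FULL: call_rcu_bh() becomes call_rcu(), and rcu_read_lock_bh() becomes local_bh_disable() plus rcu_read_lock(), so existing users keep working unmodified. A sketch of such a read side, assuming kernel context and a made-up function name:

#include <linux/rcupdate.h>

static void example_bh_reader(void)
{
	rcu_read_lock_bh();	/* on RT: local_bh_disable() + rcu_read_lock() */

	/* ... lookup in a structure whose updaters use call_rcu_bh() ... */

	rcu_read_unlock_bh();
}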
9012 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
9013 index 63a4e4cf40a5..08ab12df2863 100644
9014 --- a/include/linux/rcutree.h
9015 +++ b/include/linux/rcutree.h
9016 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
9017         rcu_note_context_switch();
9018  }
9019  
9020 +#ifdef CONFIG_PREEMPT_RT_FULL
9021 +# define synchronize_rcu_bh    synchronize_rcu
9022 +#else
9023  void synchronize_rcu_bh(void);
9024 +#endif
9025  void synchronize_sched_expedited(void);
9026  void synchronize_rcu_expedited(void);
9027  
9028 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
9029  }
9030  
9031  void rcu_barrier(void);
9032 +#ifdef CONFIG_PREEMPT_RT_FULL
9033 +# define rcu_barrier_bh                rcu_barrier
9034 +#else
9035  void rcu_barrier_bh(void);
9036 +#endif
9037  void rcu_barrier_sched(void);
9038  unsigned long get_state_synchronize_rcu(void);
9039  void cond_synchronize_rcu(unsigned long oldstate);
9040 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
9041  extern unsigned long rcutorture_testseq;
9042  extern unsigned long rcutorture_vernum;
9043  unsigned long rcu_batches_started(void);
9044 -unsigned long rcu_batches_started_bh(void);
9045  unsigned long rcu_batches_started_sched(void);
9046  unsigned long rcu_batches_completed(void);
9047 -unsigned long rcu_batches_completed_bh(void);
9048  unsigned long rcu_batches_completed_sched(void);
9049  unsigned long rcu_exp_batches_completed(void);
9050  unsigned long rcu_exp_batches_completed_sched(void);
9051  void show_rcu_gp_kthreads(void);
9052  
9053  void rcu_force_quiescent_state(void);
9054 -void rcu_bh_force_quiescent_state(void);
9055  void rcu_sched_force_quiescent_state(void);
9056  
9057  void rcu_idle_enter(void);
9058 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
9059  
9060  bool rcu_is_watching(void);
9061  
9062 +#ifndef CONFIG_PREEMPT_RT_FULL
9063 +void rcu_bh_force_quiescent_state(void);
9064 +unsigned long rcu_batches_started_bh(void);
9065 +unsigned long rcu_batches_completed_bh(void);
9066 +#else
9067 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
9068 +# define rcu_batches_completed_bh      rcu_batches_completed
9069 +# define rcu_batches_started_bh                rcu_batches_completed
9070 +#endif
9071 +
9072  void rcu_all_qs(void);
9073  
9074  /* RCUtree hotplug events */
9075 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
9076 index 1abba5ce2a2f..294a8b4875f1 100644
9077 --- a/include/linux/rtmutex.h
9078 +++ b/include/linux/rtmutex.h
9079 @@ -13,11 +13,15 @@
9080  #define __LINUX_RT_MUTEX_H
9081  
9082  #include <linux/linkage.h>
9083 +#include <linux/spinlock_types_raw.h>
9084  #include <linux/rbtree.h>
9085 -#include <linux/spinlock_types.h>
9086  
9087  extern int max_lock_depth; /* for sysctl */
9088  
9089 +#ifdef CONFIG_DEBUG_MUTEXES
9090 +#include <linux/debug_locks.h>
9091 +#endif
9092 +
9093  /**
9094   * The rt_mutex structure
9095   *
9096 @@ -31,8 +35,8 @@ struct rt_mutex {
9097         struct rb_root          waiters;
9098         struct rb_node          *waiters_leftmost;
9099         struct task_struct      *owner;
9100 -#ifdef CONFIG_DEBUG_RT_MUTEXES
9101         int                     save_state;
9102 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9103         const char              *name, *file;
9104         int                     line;
9105         void                    *magic;
9106 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
9107  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
9108  #endif
9109  
9110 +# define rt_mutex_init(mutex)                                  \
9111 +       do {                                                    \
9112 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
9113 +               __rt_mutex_init(mutex, #mutex);                 \
9114 +       } while (0)
9115 +
9116  #ifdef CONFIG_DEBUG_RT_MUTEXES
9117  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
9118         , .name = #mutexname, .file = __FILE__, .line = __LINE__
9119 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
9120   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
9121  #else
9122  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
9123 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
9124  # define rt_mutex_debug_task_free(t)                   do { } while (0)
9125  #endif
9126  
9127 -#define __RT_MUTEX_INITIALIZER(mutexname) \
9128 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
9129 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
9130 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
9131         , .waiters = RB_ROOT \
9132         , .owner = NULL \
9133 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
9134 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
9135 +
9136 +#define __RT_MUTEX_INITIALIZER(mutexname) \
9137 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
9138 +
9139 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
9140 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
9141 +       , .save_state = 1 }
9142  
9143  #define DEFINE_RT_MUTEX(mutexname) \
9144         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
9145 @@ -90,7 +105,9 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
9146  extern void rt_mutex_destroy(struct rt_mutex *lock);
9147  
9148  extern void rt_mutex_lock(struct rt_mutex *lock);
9149 +extern int rt_mutex_lock_state(struct rt_mutex *lock, int state);
9150  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
9151 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
9152  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
9153                                struct hrtimer_sleeper *timeout);
9154  
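rt_mutex_init() above now unconditionally initializes ->wait_lock before calling __rt_mutex_init(), in both debug and non-debug builds, and the split initializers add a save_state variant for spinlock sleepers. A sketch of the two ordinary initialization forms, with made-up names:

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(example_static_rtm);	/* uses __RT_MUTEX_INITIALIZER() */

static void example_dynamic_rtm(struct rt_mutex *lock)
{
	rt_mutex_init(lock);		/* raw_spin_lock_init() + __rt_mutex_init() */

	rt_mutex_lock(lock);
	rt_mutex_unlock(lock);
}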
9155 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
9156 new file mode 100644
9157 index 000000000000..49ed2d45d3be
9158 --- /dev/null
9159 +++ b/include/linux/rwlock_rt.h
9160 @@ -0,0 +1,99 @@
9161 +#ifndef __LINUX_RWLOCK_RT_H
9162 +#define __LINUX_RWLOCK_RT_H
9163 +
9164 +#ifndef __LINUX_SPINLOCK_H
9165 +#error Do not include directly. Use spinlock.h
9166 +#endif
9167 +
9168 +#define rwlock_init(rwl)                               \
9169 +do {                                                   \
9170 +       static struct lock_class_key __key;             \
9171 +                                                       \
9172 +       rt_mutex_init(&(rwl)->lock);                    \
9173 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
9174 +} while (0)
9175 +
9176 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
9177 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
9178 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
9179 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
9180 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
9181 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
9182 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
9183 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
9184 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
9185 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
9186 +
9187 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
9188 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
9189 +
9190 +#define write_trylock_irqsave(lock, flags)     \
9191 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
9192 +
9193 +#define read_lock_irqsave(lock, flags)                 \
9194 +       do {                                            \
9195 +               typecheck(unsigned long, flags);        \
9196 +               flags = rt_read_lock_irqsave(lock);     \
9197 +       } while (0)
9198 +
9199 +#define write_lock_irqsave(lock, flags)                        \
9200 +       do {                                            \
9201 +               typecheck(unsigned long, flags);        \
9202 +               flags = rt_write_lock_irqsave(lock);    \
9203 +       } while (0)
9204 +
9205 +#define read_lock(lock)                rt_read_lock(lock)
9206 +
9207 +#define read_lock_bh(lock)                             \
9208 +       do {                                            \
9209 +               local_bh_disable();                     \
9210 +               rt_read_lock(lock);                     \
9211 +       } while (0)
9212 +
9213 +#define read_lock_irq(lock)    read_lock(lock)
9214 +
9215 +#define write_lock(lock)       rt_write_lock(lock)
9216 +
9217 +#define write_lock_bh(lock)                            \
9218 +       do {                                            \
9219 +               local_bh_disable();                     \
9220 +               rt_write_lock(lock);                    \
9221 +       } while (0)
9222 +
9223 +#define write_lock_irq(lock)   write_lock(lock)
9224 +
9225 +#define read_unlock(lock)      rt_read_unlock(lock)
9226 +
9227 +#define read_unlock_bh(lock)                           \
9228 +       do {                                            \
9229 +               rt_read_unlock(lock);                   \
9230 +               local_bh_enable();                      \
9231 +       } while (0)
9232 +
9233 +#define read_unlock_irq(lock)  read_unlock(lock)
9234 +
9235 +#define write_unlock(lock)     rt_write_unlock(lock)
9236 +
9237 +#define write_unlock_bh(lock)                          \
9238 +       do {                                            \
9239 +               rt_write_unlock(lock);                  \
9240 +               local_bh_enable();                      \
9241 +       } while (0)
9242 +
9243 +#define write_unlock_irq(lock) write_unlock(lock)
9244 +
9245 +#define read_unlock_irqrestore(lock, flags)            \
9246 +       do {                                            \
9247 +               typecheck(unsigned long, flags);        \
9248 +               (void) flags;                           \
9249 +               rt_read_unlock(lock);                   \
9250 +       } while (0)
9251 +
9252 +#define write_unlock_irqrestore(lock, flags) \
9253 +       do {                                            \
9254 +               typecheck(unsigned long, flags);        \
9255 +               (void) flags;                           \
9256 +               rt_write_unlock(lock);                  \
9257 +       } while (0)
9258 +
9259 +#endif
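The rwlock_t API above keeps its usual shape on RT, but the lock is backed by an rtmutex: the _irq/_irqsave variants no longer disable interrupts, and flags is carried only for interface compatibility. A minimal sketch, assuming kernel context and made-up names:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_rwlock);

static void example_rwlock_users(void)
{
	unsigned long flags;

	read_lock(&example_rwlock);		/* rt_read_lock(): may sleep on RT */
	read_unlock(&example_rwlock);

	write_lock_irqsave(&example_rwlock, flags);	/* does not disable IRQs on RT */
	write_unlock_irqrestore(&example_rwlock, flags);
}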
9260 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
9261 index cc0072e93e36..5317cd957292 100644
9262 --- a/include/linux/rwlock_types.h
9263 +++ b/include/linux/rwlock_types.h
9264 @@ -1,6 +1,10 @@
9265  #ifndef __LINUX_RWLOCK_TYPES_H
9266  #define __LINUX_RWLOCK_TYPES_H
9267  
9268 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
9269 +# error "Do not include directly, include spinlock_types.h"
9270 +#endif
9271 +
9272  /*
9273   * include/linux/rwlock_types.h - generic rwlock type definitions
9274   *                               and initializers
9275 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
9276 new file mode 100644
9277 index 000000000000..51b28d775fe1
9278 --- /dev/null
9279 +++ b/include/linux/rwlock_types_rt.h
9280 @@ -0,0 +1,33 @@
9281 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
9282 +#define __LINUX_RWLOCK_TYPES_RT_H
9283 +
9284 +#ifndef __LINUX_SPINLOCK_TYPES_H
9285 +#error "Do not include directly. Include spinlock_types.h instead"
9286 +#endif
9287 +
9288 +/*
9289 + * rwlocks - rtmutex which allows single reader recursion
9290 + */
9291 +typedef struct {
9292 +       struct rt_mutex         lock;
9293 +       int                     read_depth;
9294 +       unsigned int            break_lock;
9295 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9296 +       struct lockdep_map      dep_map;
9297 +#endif
9298 +} rwlock_t;
9299 +
9300 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9301 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
9302 +#else
9303 +# define RW_DEP_MAP_INIT(lockname)
9304 +#endif
9305 +
9306 +#define __RW_LOCK_UNLOCKED(name) \
9307 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
9308 +         RW_DEP_MAP_INIT(name) }
9309 +
9310 +#define DEFINE_RWLOCK(name) \
9311 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
9312 +
9313 +#endif
9314 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
9315 index dd1d14250340..aa2ac1f65c2d 100644
9316 --- a/include/linux/rwsem.h
9317 +++ b/include/linux/rwsem.h
9318 @@ -19,6 +19,10 @@
9319  #include <linux/osq_lock.h>
9320  #endif
9321  
9322 +#ifdef CONFIG_PREEMPT_RT_FULL
9323 +#include <linux/rwsem_rt.h>
9324 +#else /* PREEMPT_RT_FULL */
9325 +
9326  struct rw_semaphore;
9327  
9328  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
9329 @@ -106,6 +110,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
9330         return !list_empty(&sem->wait_list);
9331  }
9332  
9333 +#endif /* !PREEMPT_RT_FULL */
9334 +
9335 +/*
9336 + * The functions below are the same for all rwsem implementations including
9337 + * the RT specific variant.
9338 + */
9339 +
9340  /*
9341   * lock for reading
9342   */
9343 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
9344 new file mode 100644
9345 index 000000000000..2ffbf093ae92
9346 --- /dev/null
9347 +++ b/include/linux/rwsem_rt.h
9348 @@ -0,0 +1,67 @@
9349 +#ifndef _LINUX_RWSEM_RT_H
9350 +#define _LINUX_RWSEM_RT_H
9351 +
9352 +#ifndef _LINUX_RWSEM_H
9353 +#error "Include rwsem.h"
9354 +#endif
9355 +
9356 +#include <linux/rtmutex.h>
9357 +#include <linux/swait.h>
9358 +
9359 +#define READER_BIAS            (1U << 31)
9360 +#define WRITER_BIAS            (1U << 30)
9361 +
9362 +struct rw_semaphore {
9363 +       atomic_t                readers;
9364 +       struct rt_mutex         rtmutex;
9365 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9366 +       struct lockdep_map      dep_map;
9367 +#endif
9368 +};
9369 +
9370 +#define __RWSEM_INITIALIZER(name)                              \
9371 +{                                                              \
9372 +       .readers = ATOMIC_INIT(READER_BIAS),                    \
9373 +       .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex),        \
9374 +       RW_DEP_MAP_INIT(name)                                   \
9375 +}
9376 +
9377 +#define DECLARE_RWSEM(lockname) \
9378 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
9379 +
9380 +extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
9381 +                         struct lock_class_key *key);
9382 +
9383 +#define __init_rwsem(sem, name, key)                   \
9384 +do {                                                   \
9385 +               rt_mutex_init(&(sem)->rtmutex);         \
9386 +               __rwsem_init((sem), (name), (key));     \
9387 +} while (0)
9388 +
9389 +#define init_rwsem(sem)                                        \
9390 +do {                                                   \
9391 +       static struct lock_class_key __key;             \
9392 +                                                       \
9393 +       __init_rwsem((sem), #sem, &__key);              \
9394 +} while (0)
9395 +
9396 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
9397 +{
9398 +       return atomic_read(&sem->readers) != READER_BIAS;
9399 +}
9400 +
9401 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
9402 +{
9403 +       return atomic_read(&sem->readers) > 0;
9404 +}
9405 +
9406 +extern void __down_read(struct rw_semaphore *sem);
9407 +extern int __down_read_trylock(struct rw_semaphore *sem);
9408 +extern void __down_write(struct rw_semaphore *sem);
9409 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
9410 +extern int __down_write_trylock(struct rw_semaphore *sem);
9411 +extern void __up_read(struct rw_semaphore *sem);
9412 +extern void __up_write(struct rw_semaphore *sem);
9413 +extern void __downgrade_write(struct rw_semaphore *sem);
9414 +
9415 +#endif
9416 diff --git a/include/linux/sched.h b/include/linux/sched.h
9417 index a4d0afc009a7..e775696b480a 100644
9418 --- a/include/linux/sched.h
9419 +++ b/include/linux/sched.h
9420 @@ -26,6 +26,7 @@ struct sched_param {
9421  #include <linux/nodemask.h>
9422  #include <linux/mm_types.h>
9423  #include <linux/preempt.h>
9424 +#include <asm/kmap_types.h>
9425  
9426  #include <asm/page.h>
9427  #include <asm/ptrace.h>
9428 @@ -236,17 +237,13 @@ extern char ___assert_task_state[1 - 2*!!(
9429  
9430  /* Convenience macros for the sake of wake_up */
9431  #define TASK_NORMAL            (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
9432 -#define TASK_ALL               (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
9433  
9434  /* get_task_state() */
9435  #define TASK_REPORT            (TASK_RUNNING | TASK_INTERRUPTIBLE | \
9436                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
9437                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
9438  
9439 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
9440  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
9441 -#define task_is_stopped_or_traced(task)        \
9442 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
9443  #define task_contributes_to_load(task) \
9444                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
9445                                  (task->flags & PF_FROZEN) == 0 && \
9446 @@ -312,6 +309,11 @@ extern char ___assert_task_state[1 - 2*!!(
9447  
9448  #endif
9449  
9450 +#define __set_current_state_no_track(state_value)      \
9451 +       do { current->state = (state_value); } while (0)
9452 +#define set_current_state_no_track(state_value)                \
9453 +       set_mb(current->state, (state_value))
9454 +
9455  /* Task command name length */
9456  #define TASK_COMM_LEN 16
9457  
9458 @@ -1022,9 +1024,31 @@ struct wake_q_head {
9459  #define WAKE_Q(name)                                   \
9460         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
9461  
9462 -extern void wake_q_add(struct wake_q_head *head,
9463 -                      struct task_struct *task);
9464 -extern void wake_up_q(struct wake_q_head *head);
9465 +extern void __wake_q_add(struct wake_q_head *head,
9466 +                        struct task_struct *task, bool sleeper);
9467 +static inline void wake_q_add(struct wake_q_head *head,
9468 +                             struct task_struct *task)
9469 +{
9470 +       __wake_q_add(head, task, false);
9471 +}
9472 +
9473 +static inline void wake_q_add_sleeper(struct wake_q_head *head,
9474 +                                     struct task_struct *task)
9475 +{
9476 +       __wake_q_add(head, task, true);
9477 +}
9478 +
9479 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
9480 +
9481 +static inline void wake_up_q(struct wake_q_head *head)
9482 +{
9483 +       __wake_up_q(head, false);
9484 +}
9485 +
9486 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
9487 +{
9488 +       __wake_up_q(head, true);
9489 +}
9490  
9491  /*
9492   * sched-domains (multiprocessor balancing) declarations:
9493 @@ -1491,6 +1515,7 @@ struct task_struct {
9494         struct thread_info thread_info;
9495  #endif
9496         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9497 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
9498         void *stack;
9499         atomic_t usage;
9500         unsigned int flags;     /* per process flags, defined below */
9501 @@ -1530,6 +1555,13 @@ struct task_struct {
9502  #endif
9503  
9504         unsigned int policy;
9505 +#ifdef CONFIG_PREEMPT_RT_FULL
9506 +       int migrate_disable;
9507 +       int migrate_disable_update;
9508 +# ifdef CONFIG_SCHED_DEBUG
9509 +       int migrate_disable_atomic;
9510 +# endif
9511 +#endif
9512         int nr_cpus_allowed;
9513         cpumask_t cpus_allowed;
9514  
9515 @@ -1668,6 +1700,9 @@ struct task_struct {
9516  
9517         struct task_cputime cputime_expires;
9518         struct list_head cpu_timers[3];
9519 +#ifdef CONFIG_PREEMPT_RT_BASE
9520 +       struct task_struct *posix_timer_list;
9521 +#endif
9522  
9523  /* process credentials */
9524         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
9525 @@ -1699,10 +1734,15 @@ struct task_struct {
9526  /* signal handlers */
9527         struct signal_struct *signal;
9528         struct sighand_struct *sighand;
9529 +       struct sigqueue *sigqueue_cache;
9530  
9531         sigset_t blocked, real_blocked;
9532         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9533         struct sigpending pending;
9534 +#ifdef CONFIG_PREEMPT_RT_FULL
9535 +       /* TODO: move me into ->restart_block ? */
9536 +       struct siginfo forced_info;
9537 +#endif
9538  
9539         unsigned long sas_ss_sp;
9540         size_t sas_ss_size;
9541 @@ -1728,11 +1768,14 @@ struct task_struct {
9542         raw_spinlock_t pi_lock;
9543  
9544         struct wake_q_node wake_q;
9545 +       struct wake_q_node wake_q_sleeper;
9546  
9547  #ifdef CONFIG_RT_MUTEXES
9548         /* PI waiters blocked on a rt_mutex held by this task */
9549         struct rb_root pi_waiters;
9550         struct rb_node *pi_waiters_leftmost;
9551 +       /* Updated under owner's pi_lock and rq lock */
9552 +       struct task_struct      *pi_top_task;
9553         /* Deadlock detection and priority inheritance handling */
9554         struct rt_mutex_waiter *pi_blocked_on;
9555  #endif
9556 @@ -1931,6 +1974,12 @@ struct task_struct {
9557         /* bitmask and counter of trace recursion */
9558         unsigned long trace_recursion;
9559  #endif /* CONFIG_TRACING */
9560 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
9561 +       u64 preempt_timestamp_hist;
9562 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
9563 +       long timer_offset;
9564 +#endif
9565 +#endif
9566  #ifdef CONFIG_KCOV
9567         /* Coverage collection mode enabled for this task (0 if disabled). */
9568         enum kcov_mode kcov_mode;
9569 @@ -1956,9 +2005,23 @@ struct task_struct {
9570         unsigned int    sequential_io;
9571         unsigned int    sequential_io_avg;
9572  #endif
9573 +#ifdef CONFIG_PREEMPT_RT_BASE
9574 +       struct rcu_head put_rcu;
9575 +       int softirq_nestcnt;
9576 +       unsigned int softirqs_raised;
9577 +#endif
9578 +#ifdef CONFIG_PREEMPT_RT_FULL
9579 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9580 +       int kmap_idx;
9581 +       pte_t kmap_pte[KM_TYPE_NR];
9582 +# endif
9583 +#endif
9584  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9585         unsigned long   task_state_change;
9586  #endif
9587 +#ifdef CONFIG_PREEMPT_RT_FULL
9588 +       int xmit_recursion;
9589 +#endif
9590         int pagefault_disabled;
9591  #ifdef CONFIG_MMU
9592         struct task_struct *oom_reaper_list;
9593 @@ -1998,14 +2061,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
9594  }
9595  #endif
9596  
9597 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9598 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9599 -
9600 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9601 -{
9602 -       return p->nr_cpus_allowed;
9603 -}
9604 -
9605  #define TNF_MIGRATED   0x01
9606  #define TNF_NO_GROUP   0x02
9607  #define TNF_SHARED     0x04
9608 @@ -2225,6 +2280,15 @@ extern struct pid *cad_pid;
9609  extern void free_task(struct task_struct *tsk);
9610  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9611  
9612 +#ifdef CONFIG_PREEMPT_RT_BASE
9613 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9614 +
9615 +static inline void put_task_struct(struct task_struct *t)
9616 +{
9617 +       if (atomic_dec_and_test(&t->usage))
9618 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9619 +}
9620 +#else
9621  extern void __put_task_struct(struct task_struct *t);
9622  
9623  static inline void put_task_struct(struct task_struct *t)
9624 @@ -2232,6 +2296,7 @@ static inline void put_task_struct(struct task_struct *t)
9625         if (atomic_dec_and_test(&t->usage))
9626                 __put_task_struct(t);
9627  }
9628 +#endif
9629  
9630  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9631  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9632 @@ -2273,6 +2338,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
9633  /*
9634   * Per process flags
9635   */
9636 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9637  #define PF_EXITING     0x00000004      /* getting shut down */
9638  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9639  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9640 @@ -2441,6 +2507,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
9641  
9642  extern int set_cpus_allowed_ptr(struct task_struct *p,
9643                                 const struct cpumask *new_mask);
9644 +int migrate_me(void);
9645 +void tell_sched_cpu_down_begin(int cpu);
9646 +void tell_sched_cpu_down_done(int cpu);
9647 +
9648  #else
9649  static inline void do_set_cpus_allowed(struct task_struct *p,
9650                                       const struct cpumask *new_mask)
9651 @@ -2453,6 +2523,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
9652                 return -EINVAL;
9653         return 0;
9654  }
9655 +static inline int migrate_me(void) { return 0; }
9656 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9657 +static inline void tell_sched_cpu_down_done(int cpu) { }
9658  #endif
9659  
9660  #ifdef CONFIG_NO_HZ_COMMON
9661 @@ -2691,6 +2764,7 @@ extern void xtime_update(unsigned long ticks);
9662  
9663  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9664  extern int wake_up_process(struct task_struct *tsk);
9665 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9666  extern void wake_up_new_task(struct task_struct *tsk);
9667  #ifdef CONFIG_SMP
9668   extern void kick_process(struct task_struct *tsk);
9669 @@ -2899,6 +2973,17 @@ static inline void mmdrop(struct mm_struct *mm)
9670                 __mmdrop(mm);
9671  }
9672  
9673 +#ifdef CONFIG_PREEMPT_RT_BASE
9674 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9675 +static inline void mmdrop_delayed(struct mm_struct *mm)
9676 +{
9677 +       if (atomic_dec_and_test(&mm->mm_count))
9678 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9679 +}
9680 +#else
9681 +# define mmdrop_delayed(mm)    mmdrop(mm)
9682 +#endif
9683 +
9684  static inline void mmdrop_async_fn(struct work_struct *work)
9685  {
9686         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9687 @@ -3291,6 +3376,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
9688         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9689  }
9690  
9691 +#ifdef CONFIG_PREEMPT_LAZY
9692 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9693 +{
9694 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9695 +}
9696 +
9697 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9698 +{
9699 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9700 +}
9701 +
9702 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9703 +{
9704 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9705 +}
9706 +
9707 +static inline int need_resched_lazy(void)
9708 +{
9709 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9710 +}
9711 +
9712 +static inline int need_resched_now(void)
9713 +{
9714 +       return test_thread_flag(TIF_NEED_RESCHED);
9715 +}
9716 +
9717 +#else
9718 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9719 +static inline int need_resched_lazy(void) { return 0; }
9720 +
9721 +static inline int need_resched_now(void)
9722 +{
9723 +       return test_thread_flag(TIF_NEED_RESCHED);
9724 +}
9725 +
9726 +#endif
9727 +
9728  static inline int restart_syscall(void)
9729  {
9730         set_tsk_thread_flag(current, TIF_SIGPENDING);
9731 @@ -3322,6 +3444,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
9732         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9733  }
9734  
9735 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9736 +{
9737 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9738 +               return true;
9739 +#ifdef CONFIG_PREEMPT_RT_FULL
9740 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9741 +               return true;
9742 +#endif
9743 +       return false;
9744 +}
9745 +
9746 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9747 +{
9748 +       bool traced_stopped;
9749 +
9750 +#ifdef CONFIG_PREEMPT_RT_FULL
9751 +       unsigned long flags;
9752 +
9753 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9754 +       traced_stopped = __task_is_stopped_or_traced(task);
9755 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9756 +#else
9757 +       traced_stopped = __task_is_stopped_or_traced(task);
9758 +#endif
9759 +       return traced_stopped;
9760 +}
9761 +
9762 +static inline bool task_is_traced(struct task_struct *task)
9763 +{
9764 +       bool traced = false;
9765 +
9766 +       if (task->state & __TASK_TRACED)
9767 +               return true;
9768 +#ifdef CONFIG_PREEMPT_RT_FULL
9769 +       /* in case the task is sleeping on tasklist_lock */
9770 +       raw_spin_lock_irq(&task->pi_lock);
9771 +       if (task->state & __TASK_TRACED)
9772 +               traced = true;
9773 +       else if (task->saved_state & __TASK_TRACED)
9774 +               traced = true;
9775 +       raw_spin_unlock_irq(&task->pi_lock);
9776 +#endif
9777 +       return traced;
9778 +}
9779 +
9780  /*
9781   * cond_resched() and cond_resched_lock(): latency reduction via
9782   * explicit rescheduling in places that are safe. The return
9783 @@ -3347,12 +3514,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
9784         __cond_resched_lock(lock);                              \
9785  })
9786  
9787 +#ifndef CONFIG_PREEMPT_RT_FULL
9788  extern int __cond_resched_softirq(void);
9789  
9790  #define cond_resched_softirq() ({                                      \
9791         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9792         __cond_resched_softirq();                                       \
9793  })
9794 +#else
9795 +# define cond_resched_softirq()                cond_resched()
9796 +#endif
9797  
9798  static inline void cond_resched_rcu(void)
9799  {
9800 @@ -3527,6 +3698,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
9801  
9802  #endif /* CONFIG_SMP */
9803  
9804 +static inline int __migrate_disabled(struct task_struct *p)
9805 +{
9806 +#ifdef CONFIG_PREEMPT_RT_FULL
9807 +       return p->migrate_disable;
9808 +#else
9809 +       return 0;
9810 +#endif
9811 +}
9812 +
9813 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9814 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9815 +{
9816 +       if (__migrate_disabled(p))
9817 +               return cpumask_of(task_cpu(p));
9818 +
9819 +       return &p->cpus_allowed;
9820 +}
9821 +
9822 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9823 +{
9824 +       if (__migrate_disabled(p))
9825 +               return 1;
9826 +       return p->nr_cpus_allowed;
9827 +}
9828 +
9829  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9830  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9831  
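
Usage sketch (not part of the patch): the reworked tsk_cpus_allowed()/tsk_nr_cpus_allowed() accessors above collapse a task's affinity to its current CPU while migration is disabled. The helper below is invented for illustration only and assumes the accessors behave exactly as defined in this hunk.

#include <linux/sched.h>
#include <linux/cpumask.h>

/* check_affinity() is a hypothetical helper, not part of this patch. */
static bool check_affinity(struct task_struct *p, int cpu)
{
        /* Under migrate_disable() the mask degenerates to the task's
         * current CPU, so only cpu == task_cpu(p) can succeed. */
        if (tsk_nr_cpus_allowed(p) == 1)
                return cpu == task_cpu(p);

        return cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
}
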
9832 diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
9833 index a30b172df6e1..db3e91f2bc03 100644
9834 --- a/include/linux/sched/rt.h
9835 +++ b/include/linux/sched/rt.h
9836 @@ -16,27 +16,20 @@ static inline int rt_task(struct task_struct *p)
9837  }
9838  
9839  #ifdef CONFIG_RT_MUTEXES
9840 -extern int rt_mutex_getprio(struct task_struct *p);
9841 -extern void rt_mutex_setprio(struct task_struct *p, int prio);
9842 -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
9843 -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
9844 +/*
9845 + * Must hold either p->pi_lock or task_rq(p)->lock.
9846 + */
9847 +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
9848 +{
9849 +       return p->pi_top_task;
9850 +}
9851 +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
9852  extern void rt_mutex_adjust_pi(struct task_struct *p);
9853  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
9854  {
9855         return tsk->pi_blocked_on != NULL;
9856  }
9857  #else
9858 -static inline int rt_mutex_getprio(struct task_struct *p)
9859 -{
9860 -       return p->normal_prio;
9861 -}
9862 -
9863 -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
9864 -                                             int newprio)
9865 -{
9866 -       return newprio;
9867 -}
9868 -
9869  static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
9870  {
9871         return NULL;
9872 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
9873 index ead97654c4e9..3d7223ffdd3b 100644
9874 --- a/include/linux/seqlock.h
9875 +++ b/include/linux/seqlock.h
9876 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
9877         return __read_seqcount_retry(s, start);
9878  }
9879  
9880 -
9881 -
9882 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9883 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9884  {
9885         s->sequence++;
9886         smp_wmb();
9887  }
9888  
9889 -static inline void raw_write_seqcount_end(seqcount_t *s)
9890 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9891 +{
9892 +       preempt_disable_rt();
9893 +       __raw_write_seqcount_begin(s);
9894 +}
9895 +
9896 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9897  {
9898         smp_wmb();
9899         s->sequence++;
9900  }
9901  
9902 +static inline void raw_write_seqcount_end(seqcount_t *s)
9903 +{
9904 +       __raw_write_seqcount_end(s);
9905 +       preempt_enable_rt();
9906 +}
9907 +
9908  /**
9909   * raw_write_seqcount_barrier - do a seq write barrier
9910   * @s: pointer to seqcount_t
9911 @@ -428,10 +438,32 @@ typedef struct {
9912  /*
9913   * Read side functions for starting and finalizing a read side section.
9914   */
9915 +#ifndef CONFIG_PREEMPT_RT_FULL
9916  static inline unsigned read_seqbegin(const seqlock_t *sl)
9917  {
9918         return read_seqcount_begin(&sl->seqcount);
9919  }
9920 +#else
9921 +/*
9922 + * Starvation safe read side for RT
9923 + */
9924 +static inline unsigned read_seqbegin(seqlock_t *sl)
9925 +{
9926 +       unsigned ret;
9927 +
9928 +repeat:
9929 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9930 +       if (unlikely(ret & 1)) {
9931 +               /*
9932 +                * Take the lock and let the writer proceed (i.e. evtl
9933 +                * boost it), otherwise we could loop here forever.
9934 +                */
9935 +               spin_unlock_wait(&sl->lock);
9936 +               goto repeat;
9937 +       }
9938 +       return ret;
9939 +}
9940 +#endif
9941  
9942  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9943  {
9944 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9945  static inline void write_seqlock(seqlock_t *sl)
9946  {
9947         spin_lock(&sl->lock);
9948 -       write_seqcount_begin(&sl->seqcount);
9949 +       __raw_write_seqcount_begin(&sl->seqcount);
9950 +}
9951 +
9952 +static inline int try_write_seqlock(seqlock_t *sl)
9953 +{
9954 +       if (spin_trylock(&sl->lock)) {
9955 +               __raw_write_seqcount_begin(&sl->seqcount);
9956 +               return 1;
9957 +       }
9958 +       return 0;
9959  }
9960  
9961  static inline void write_sequnlock(seqlock_t *sl)
9962  {
9963 -       write_seqcount_end(&sl->seqcount);
9964 +       __raw_write_seqcount_end(&sl->seqcount);
9965         spin_unlock(&sl->lock);
9966  }
9967  
9968  static inline void write_seqlock_bh(seqlock_t *sl)
9969  {
9970         spin_lock_bh(&sl->lock);
9971 -       write_seqcount_begin(&sl->seqcount);
9972 +       __raw_write_seqcount_begin(&sl->seqcount);
9973  }
9974  
9975  static inline void write_sequnlock_bh(seqlock_t *sl)
9976  {
9977 -       write_seqcount_end(&sl->seqcount);
9978 +       __raw_write_seqcount_end(&sl->seqcount);
9979         spin_unlock_bh(&sl->lock);
9980  }
9981  
9982  static inline void write_seqlock_irq(seqlock_t *sl)
9983  {
9984         spin_lock_irq(&sl->lock);
9985 -       write_seqcount_begin(&sl->seqcount);
9986 +       __raw_write_seqcount_begin(&sl->seqcount);
9987  }
9988  
9989  static inline void write_sequnlock_irq(seqlock_t *sl)
9990  {
9991 -       write_seqcount_end(&sl->seqcount);
9992 +       __raw_write_seqcount_end(&sl->seqcount);
9993         spin_unlock_irq(&sl->lock);
9994  }
9995  
9996 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9997         unsigned long flags;
9998  
9999         spin_lock_irqsave(&sl->lock, flags);
10000 -       write_seqcount_begin(&sl->seqcount);
10001 +       __raw_write_seqcount_begin(&sl->seqcount);
10002         return flags;
10003  }
10004  
10005 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
10006  static inline void
10007  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
10008  {
10009 -       write_seqcount_end(&sl->seqcount);
10010 +       __raw_write_seqcount_end(&sl->seqcount);
10011         spin_unlock_irqrestore(&sl->lock, flags);
10012  }
10013  
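
Usage sketch (not part of the patch): with the split above, write_seqlock()/write_sequnlock() on a seqlock_t now wrap the lockless __raw_write_seqcount_begin()/end() helpers, and the RT read_seqbegin() waits on the lock (boosting a preempted writer) instead of busy-looping on the sequence. Existing callers keep the usual pattern; the counters below are made up for illustration.

#include <linux/types.h>
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(demo_stats_lock);         /* hypothetical */
static u64 demo_bytes;

static void demo_stats_add(unsigned int len)
{
        write_seqlock(&demo_stats_lock);        /* serialized by sl->lock */
        demo_bytes += len;
        write_sequnlock(&demo_stats_lock);
}

static u64 demo_stats_read(void)
{
        unsigned int seq;
        u64 val;

        do {
                seq = read_seqbegin(&demo_stats_lock);  /* RT: may wait on the lock */
                val = demo_bytes;
        } while (read_seqretry(&demo_stats_lock, seq));

        return val;
}
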
10014 diff --git a/include/linux/signal.h b/include/linux/signal.h
10015 index b63f63eaa39c..295540fdfc72 100644
10016 --- a/include/linux/signal.h
10017 +++ b/include/linux/signal.h
10018 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
10019  }
10020  
10021  extern void flush_sigqueue(struct sigpending *queue);
10022 +extern void flush_task_sigqueue(struct task_struct *tsk);
10023  
10024  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
10025  static inline int valid_signal(unsigned long sig)
10026 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
10027 index 601dfa849d30..dca387a8fa6b 100644
10028 --- a/include/linux/skbuff.h
10029 +++ b/include/linux/skbuff.h
10030 @@ -284,6 +284,7 @@ struct sk_buff_head {
10031  
10032         __u32           qlen;
10033         spinlock_t      lock;
10034 +       raw_spinlock_t  raw_lock;
10035  };
10036  
10037  struct sk_buff;
10038 @@ -1573,6 +1574,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
10039         __skb_queue_head_init(list);
10040  }
10041  
10042 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
10043 +{
10044 +       raw_spin_lock_init(&list->raw_lock);
10045 +       __skb_queue_head_init(list);
10046 +}
10047 +
10048  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
10049                 struct lock_class_key *class)
10050  {
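
Usage sketch (not part of the patch): skb_queue_head_init_raw() initializes the new raw_lock member so a queue can be driven from sections that must not sleep even on RT. The regular skb_queue_*() helpers still take ->lock, so such callers pair raw_lock with the unlocked __skb_*() variants; the queue below is illustrative only.

#include <linux/skbuff.h>

static struct sk_buff_head demo_q;              /* hypothetical queue */

static void demo_q_init(void)
{
        skb_queue_head_init_raw(&demo_q);
}

static void demo_q_add(struct sk_buff *skb)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_q.raw_lock, flags);
        __skb_queue_tail(&demo_q, skb);         /* caller holds raw_lock */
        raw_spin_unlock_irqrestore(&demo_q.raw_lock, flags);
}
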
10051 diff --git a/include/linux/smp.h b/include/linux/smp.h
10052 index 8e0cb7a0f836..891c533724f5 100644
10053 --- a/include/linux/smp.h
10054 +++ b/include/linux/smp.h
10055 @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
10056  extern void __init setup_nr_cpu_ids(void);
10057  extern void __init smp_init(void);
10058  
10059 +extern int __boot_cpu_id;
10060 +
10061 +static inline int get_boot_cpu_id(void)
10062 +{
10063 +       return __boot_cpu_id;
10064 +}
10065 +
10066  #else /* !SMP */
10067  
10068  static inline void smp_send_stop(void) { }
10069 @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
10070  static inline void smp_init(void) { }
10071  #endif
10072  
10073 +static inline int get_boot_cpu_id(void)
10074 +{
10075 +       return 0;
10076 +}
10077 +
10078  #endif /* !SMP */
10079  
10080  /*
10081 @@ -185,6 +197,9 @@ static inline void smp_init(void) { }
10082  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
10083  #define put_cpu()              preempt_enable()
10084  
10085 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
10086 +#define put_cpu_light()                migrate_enable()
10087 +
10088  /*
10089   * Callback to arch code if there's nosmp or maxcpus=0 on the
10090   * boot command line:
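
Usage sketch (not part of the patch): get_cpu_light()/put_cpu_light() pin a task to its CPU with migrate_disable() instead of disabling preemption, so the protected section may take sleeping locks (which spinlock_t becomes on RT). The per-CPU names below are invented, and the per-CPU locks are assumed to be spin_lock_init()ed at boot.

#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

static DEFINE_PER_CPU(spinlock_t, demo_lock);           /* hypothetical */
static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_account(void)
{
        int cpu = get_cpu_light();      /* migrate_disable() + smp_processor_id() */

        /* Only migration is excluded, not preemption, so the per-CPU data
         * still needs its own lock; on RT that lock may legitimately sleep. */
        spin_lock(&per_cpu(demo_lock, cpu));
        per_cpu(demo_counter, cpu)++;
        spin_unlock(&per_cpu(demo_lock, cpu));

        put_cpu_light();                /* migrate_enable() */
}
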
10091 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
10092 index 47dd0cebd204..b241cc044bd3 100644
10093 --- a/include/linux/spinlock.h
10094 +++ b/include/linux/spinlock.h
10095 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
10096  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
10097  
10098  /* Include rwlock functions */
10099 -#include <linux/rwlock.h>
10100 +#ifdef CONFIG_PREEMPT_RT_FULL
10101 +# include <linux/rwlock_rt.h>
10102 +#else
10103 +# include <linux/rwlock.h>
10104 +#endif
10105  
10106  /*
10107   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
10108 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
10109  # include <linux/spinlock_api_up.h>
10110  #endif
10111  
10112 +#ifdef CONFIG_PREEMPT_RT_FULL
10113 +# include <linux/spinlock_rt.h>
10114 +#else /* PREEMPT_RT_FULL */
10115 +
10116  /*
10117   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
10118   */
10119 @@ -416,4 +424,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
10120  #define atomic_dec_and_lock(atomic, lock) \
10121                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
10122  
10123 +#endif /* !PREEMPT_RT_FULL */
10124 +
10125  #endif /* __LINUX_SPINLOCK_H */
10126 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
10127 index 5344268e6e62..043263f30e81 100644
10128 --- a/include/linux/spinlock_api_smp.h
10129 +++ b/include/linux/spinlock_api_smp.h
10130 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
10131         return 0;
10132  }
10133  
10134 -#include <linux/rwlock_api_smp.h>
10135 +#ifndef CONFIG_PREEMPT_RT_FULL
10136 +# include <linux/rwlock_api_smp.h>
10137 +#endif
10138  
10139  #endif /* __LINUX_SPINLOCK_API_SMP_H */
10140 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
10141 new file mode 100644
10142 index 000000000000..43ca841b913a
10143 --- /dev/null
10144 +++ b/include/linux/spinlock_rt.h
10145 @@ -0,0 +1,162 @@
10146 +#ifndef __LINUX_SPINLOCK_RT_H
10147 +#define __LINUX_SPINLOCK_RT_H
10148 +
10149 +#ifndef __LINUX_SPINLOCK_H
10150 +#error Do not include directly. Use spinlock.h
10151 +#endif
10152 +
10153 +#include <linux/bug.h>
10154 +
10155 +extern void
10156 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
10157 +
10158 +#define spin_lock_init(slock)                          \
10159 +do {                                                   \
10160 +       static struct lock_class_key __key;             \
10161 +                                                       \
10162 +       rt_mutex_init(&(slock)->lock);                  \
10163 +       __rt_spin_lock_init(slock, #slock, &__key);     \
10164 +} while (0)
10165 +
10166 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
10167 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
10168 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
10169 +
10170 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
10171 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
10172 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
10173 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
10174 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
10175 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
10176 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
10177 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
10178 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
10179 +
10180 +/*
10181 + * lockdep-less calls, for derived types like rwlock:
10182 + * (for trylock they can use rt_mutex_trylock() directly.
10183 + */
10184 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
10185 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
10186 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
10187 +
10188 +#define spin_lock(lock)                        rt_spin_lock(lock)
10189 +
10190 +#define spin_lock_bh(lock)                     \
10191 +       do {                                    \
10192 +               local_bh_disable();             \
10193 +               rt_spin_lock(lock);             \
10194 +       } while (0)
10195 +
10196 +#define spin_lock_irq(lock)            spin_lock(lock)
10197 +
10198 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
10199 +
10200 +#define spin_trylock(lock)                     \
10201 +({                                             \
10202 +       int __locked;                           \
10203 +       __locked = spin_do_trylock(lock);       \
10204 +       __locked;                               \
10205 +})
10206 +
10207 +#ifdef CONFIG_LOCKDEP
10208 +# define spin_lock_nested(lock, subclass)              \
10209 +       do {                                            \
10210 +               rt_spin_lock_nested(lock, subclass);    \
10211 +       } while (0)
10212 +
10213 +#define spin_lock_bh_nested(lock, subclass)            \
10214 +       do {                                            \
10215 +               local_bh_disable();                     \
10216 +               rt_spin_lock_nested(lock, subclass);    \
10217 +       } while (0)
10218 +
10219 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
10220 +       do {                                             \
10221 +               typecheck(unsigned long, flags);         \
10222 +               flags = 0;                               \
10223 +               rt_spin_lock_nested(lock, subclass);     \
10224 +       } while (0)
10225 +#else
10226 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
10227 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
10228 +
10229 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
10230 +       do {                                             \
10231 +               typecheck(unsigned long, flags);         \
10232 +               flags = 0;                               \
10233 +               spin_lock(lock);                         \
10234 +       } while (0)
10235 +#endif
10236 +
10237 +#define spin_lock_irqsave(lock, flags)                  \
10238 +       do {                                             \
10239 +               typecheck(unsigned long, flags);         \
10240 +               flags = 0;                               \
10241 +               spin_lock(lock);                         \
10242 +       } while (0)
10243 +
10244 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
10245 +{
10246 +       unsigned long flags = 0;
10247 +#ifdef CONFIG_TRACE_IRQFLAGS
10248 +       flags = rt_spin_lock_trace_flags(lock);
10249 +#else
10250 +       spin_lock(lock); /* lock_local */
10251 +#endif
10252 +       return flags;
10253 +}
10254 +
10255 +/* FIXME: we need rt_spin_lock_nest_lock */
10256 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
10257 +
10258 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
10259 +
10260 +#define spin_unlock_bh(lock)                           \
10261 +       do {                                            \
10262 +               rt_spin_unlock(lock);                   \
10263 +               local_bh_enable();                      \
10264 +       } while (0)
10265 +
10266 +#define spin_unlock_irq(lock)          spin_unlock(lock)
10267 +
10268 +#define spin_unlock_irqrestore(lock, flags)            \
10269 +       do {                                            \
10270 +               typecheck(unsigned long, flags);        \
10271 +               (void) flags;                           \
10272 +               spin_unlock(lock);                      \
10273 +       } while (0)
10274 +
10275 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
10276 +#define spin_trylock_irq(lock) spin_trylock(lock)
10277 +
10278 +#define spin_trylock_irqsave(lock, flags)      \
10279 +       rt_spin_trylock_irqsave(lock, &(flags))
10280 +
10281 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
10282 +
10283 +#ifdef CONFIG_GENERIC_LOCKBREAK
10284 +# define spin_is_contended(lock)       ((lock)->break_lock)
10285 +#else
10286 +# define spin_is_contended(lock)       (((void)(lock), 0))
10287 +#endif
10288 +
10289 +static inline int spin_can_lock(spinlock_t *lock)
10290 +{
10291 +       return !rt_mutex_is_locked(&lock->lock);
10292 +}
10293 +
10294 +static inline int spin_is_locked(spinlock_t *lock)
10295 +{
10296 +       return rt_mutex_is_locked(&lock->lock);
10297 +}
10298 +
10299 +static inline void assert_spin_locked(spinlock_t *lock)
10300 +{
10301 +       BUG_ON(!spin_is_locked(lock));
10302 +}
10303 +
10304 +#define atomic_dec_and_lock(atomic, lock) \
10305 +       atomic_dec_and_spin_lock(atomic, lock)
10306 +
10307 +#endif
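
Usage sketch (not part of the patch): on PREEMPT_RT_FULL the definitions above turn spinlock_t into an rtmutex-backed sleeping lock; spin_lock_irqsave() no longer disables interrupts and merely zeroes the flags it is handed. Well-formed callers compile and behave unchanged; the structure below is made up.

#include <linux/spinlock.h>

struct demo_dev {                       /* hypothetical driver state */
        spinlock_t lock;                /* rtmutex-backed on RT */
        unsigned int pending;
};

static void demo_dev_kick(struct demo_dev *d)
{
        unsigned long flags;

        /* On RT this maps to rt_spin_lock(); flags is set to 0 and hard
         * interrupts stay enabled, but the caller-visible locking rules
         * do not change. */
        spin_lock_irqsave(&d->lock, flags);
        d->pending++;
        spin_unlock_irqrestore(&d->lock, flags);
}
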
10308 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
10309 index 73548eb13a5d..10bac715ea96 100644
10310 --- a/include/linux/spinlock_types.h
10311 +++ b/include/linux/spinlock_types.h
10312 @@ -9,80 +9,15 @@
10313   * Released under the General Public License (GPL).
10314   */
10315  
10316 -#if defined(CONFIG_SMP)
10317 -# include <asm/spinlock_types.h>
10318 -#else
10319 -# include <linux/spinlock_types_up.h>
10320 -#endif
10321 -
10322 -#include <linux/lockdep.h>
10323 -
10324 -typedef struct raw_spinlock {
10325 -       arch_spinlock_t raw_lock;
10326 -#ifdef CONFIG_GENERIC_LOCKBREAK
10327 -       unsigned int break_lock;
10328 -#endif
10329 -#ifdef CONFIG_DEBUG_SPINLOCK
10330 -       unsigned int magic, owner_cpu;
10331 -       void *owner;
10332 -#endif
10333 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10334 -       struct lockdep_map dep_map;
10335 -#endif
10336 -} raw_spinlock_t;
10337 -
10338 -#define SPINLOCK_MAGIC         0xdead4ead
10339 -
10340 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10341 -
10342 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10343 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10344 -#else
10345 -# define SPIN_DEP_MAP_INIT(lockname)
10346 -#endif
10347 +#include <linux/spinlock_types_raw.h>
10348  
10349 -#ifdef CONFIG_DEBUG_SPINLOCK
10350 -# define SPIN_DEBUG_INIT(lockname)             \
10351 -       .magic = SPINLOCK_MAGIC,                \
10352 -       .owner_cpu = -1,                        \
10353 -       .owner = SPINLOCK_OWNER_INIT,
10354 +#ifndef CONFIG_PREEMPT_RT_FULL
10355 +# include <linux/spinlock_types_nort.h>
10356 +# include <linux/rwlock_types.h>
10357  #else
10358 -# define SPIN_DEBUG_INIT(lockname)
10359 +# include <linux/rtmutex.h>
10360 +# include <linux/spinlock_types_rt.h>
10361 +# include <linux/rwlock_types_rt.h>
10362  #endif
10363  
10364 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10365 -       {                                       \
10366 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10367 -       SPIN_DEBUG_INIT(lockname)               \
10368 -       SPIN_DEP_MAP_INIT(lockname) }
10369 -
10370 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10371 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10372 -
10373 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10374 -
10375 -typedef struct spinlock {
10376 -       union {
10377 -               struct raw_spinlock rlock;
10378 -
10379 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10380 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10381 -               struct {
10382 -                       u8 __padding[LOCK_PADSIZE];
10383 -                       struct lockdep_map dep_map;
10384 -               };
10385 -#endif
10386 -       };
10387 -} spinlock_t;
10388 -
10389 -#define __SPIN_LOCK_INITIALIZER(lockname) \
10390 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10391 -
10392 -#define __SPIN_LOCK_UNLOCKED(lockname) \
10393 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10394 -
10395 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10396 -
10397 -#include <linux/rwlock_types.h>
10398 -
10399  #endif /* __LINUX_SPINLOCK_TYPES_H */
10400 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
10401 new file mode 100644
10402 index 000000000000..f1dac1fb1d6a
10403 --- /dev/null
10404 +++ b/include/linux/spinlock_types_nort.h
10405 @@ -0,0 +1,33 @@
10406 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
10407 +#define __LINUX_SPINLOCK_TYPES_NORT_H
10408 +
10409 +#ifndef __LINUX_SPINLOCK_TYPES_H
10410 +#error "Do not include directly. Include spinlock_types.h instead"
10411 +#endif
10412 +
10413 +/*
10414 + * The non RT version maps spinlocks to raw_spinlocks
10415 + */
10416 +typedef struct spinlock {
10417 +       union {
10418 +               struct raw_spinlock rlock;
10419 +
10420 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10421 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10422 +               struct {
10423 +                       u8 __padding[LOCK_PADSIZE];
10424 +                       struct lockdep_map dep_map;
10425 +               };
10426 +#endif
10427 +       };
10428 +} spinlock_t;
10429 +
10430 +#define __SPIN_LOCK_INITIALIZER(lockname) \
10431 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10432 +
10433 +#define __SPIN_LOCK_UNLOCKED(lockname) \
10434 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10435 +
10436 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10437 +
10438 +#endif
10439 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
10440 new file mode 100644
10441 index 000000000000..edffc4d53fc9
10442 --- /dev/null
10443 +++ b/include/linux/spinlock_types_raw.h
10444 @@ -0,0 +1,56 @@
10445 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
10446 +#define __LINUX_SPINLOCK_TYPES_RAW_H
10447 +
10448 +#if defined(CONFIG_SMP)
10449 +# include <asm/spinlock_types.h>
10450 +#else
10451 +# include <linux/spinlock_types_up.h>
10452 +#endif
10453 +
10454 +#include <linux/lockdep.h>
10455 +
10456 +typedef struct raw_spinlock {
10457 +       arch_spinlock_t raw_lock;
10458 +#ifdef CONFIG_GENERIC_LOCKBREAK
10459 +       unsigned int break_lock;
10460 +#endif
10461 +#ifdef CONFIG_DEBUG_SPINLOCK
10462 +       unsigned int magic, owner_cpu;
10463 +       void *owner;
10464 +#endif
10465 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10466 +       struct lockdep_map dep_map;
10467 +#endif
10468 +} raw_spinlock_t;
10469 +
10470 +#define SPINLOCK_MAGIC         0xdead4ead
10471 +
10472 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10473 +
10474 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10475 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10476 +#else
10477 +# define SPIN_DEP_MAP_INIT(lockname)
10478 +#endif
10479 +
10480 +#ifdef CONFIG_DEBUG_SPINLOCK
10481 +# define SPIN_DEBUG_INIT(lockname)             \
10482 +       .magic = SPINLOCK_MAGIC,                \
10483 +       .owner_cpu = -1,                        \
10484 +       .owner = SPINLOCK_OWNER_INIT,
10485 +#else
10486 +# define SPIN_DEBUG_INIT(lockname)
10487 +#endif
10488 +
10489 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10490 +       {                                       \
10491 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10492 +       SPIN_DEBUG_INIT(lockname)               \
10493 +       SPIN_DEP_MAP_INIT(lockname) }
10494 +
10495 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10496 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10497 +
10498 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10499 +
10500 +#endif
10501 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
10502 new file mode 100644
10503 index 000000000000..3e3d8c5f7a9a
10504 --- /dev/null
10505 +++ b/include/linux/spinlock_types_rt.h
10506 @@ -0,0 +1,48 @@
10507 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
10508 +#define __LINUX_SPINLOCK_TYPES_RT_H
10509 +
10510 +#ifndef __LINUX_SPINLOCK_TYPES_H
10511 +#error "Do not include directly. Include spinlock_types.h instead"
10512 +#endif
10513 +
10514 +#include <linux/cache.h>
10515 +
10516 +/*
10517 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
10518 + */
10519 +typedef struct spinlock {
10520 +       struct rt_mutex         lock;
10521 +       unsigned int            break_lock;
10522 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10523 +       struct lockdep_map      dep_map;
10524 +#endif
10525 +} spinlock_t;
10526 +
10527 +#ifdef CONFIG_DEBUG_RT_MUTEXES
10528 +# define __RT_SPIN_INITIALIZER(name) \
10529 +       { \
10530 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
10531 +       .save_state = 1, \
10532 +       .file = __FILE__, \
10533 +       .line = __LINE__ , \
10534 +       }
10535 +#else
10536 +# define __RT_SPIN_INITIALIZER(name) \
10537 +       {                                                               \
10538 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
10539 +       .save_state = 1, \
10540 +       }
10541 +#endif
10542 +
10543 +/*
10544 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
10545 +*/
10546 +
10547 +#define __SPIN_LOCK_UNLOCKED(name)                     \
10548 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
10549 +         SPIN_DEP_MAP_INIT(name) }
10550 +
10551 +#define DEFINE_SPINLOCK(name) \
10552 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
10553 +
10554 +#endif
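
Illustration (not part of the patch): after the type split, the two definition macros give different lock classes on RT. This is the choice the rest of the series makes file by file; the names below are examples only.

#include <linux/spinlock.h>

/* Sleeping rtmutex-based lock under PREEMPT_RT_FULL: the default for
 * driver and subsystem data, unusable from hard interrupt context. */
static DEFINE_SPINLOCK(demo_data_lock);

/* Still a true spinning lock on RT: reserved for low-level code paths
 * (scheduler core, timekeeping, interrupt chips) that must not sleep. */
static DEFINE_RAW_SPINLOCK(demo_hw_lock);
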
10555 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
10556 index dc8eb63c6568..e793d3a257da 100644
10557 --- a/include/linux/srcu.h
10558 +++ b/include/linux/srcu.h
10559 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
10560  
10561  void process_srcu(struct work_struct *work);
10562  
10563 -#define __SRCU_STRUCT_INIT(name)                                       \
10564 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
10565         {                                                               \
10566                 .completed = -300,                                      \
10567 -               .per_cpu_ref = &name##_srcu_array,                      \
10568 +               .per_cpu_ref = &pcpu_name,                              \
10569                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
10570                 .running = false,                                       \
10571                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
10572 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
10573   */
10574  #define __DEFINE_SRCU(name, is_static)                                 \
10575         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
10576 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
10577 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
10578  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
10579  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
10580  
10581 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
10582 index d9718378a8be..e81e6dc7dcb1 100644
10583 --- a/include/linux/suspend.h
10584 +++ b/include/linux/suspend.h
10585 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
10586         void (*end)(void);
10587  };
10588  
10589 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
10590 +extern bool pm_in_action;
10591 +#else
10592 +# define pm_in_action false
10593 +#endif
10594 +
10595  #ifdef CONFIG_SUSPEND
10596  /**
10597   * suspend_set_ops - set platform dependent suspend operations
10598 diff --git a/include/linux/swait.h b/include/linux/swait.h
10599 index c1f9c62a8a50..83f004a72320 100644
10600 --- a/include/linux/swait.h
10601 +++ b/include/linux/swait.h
10602 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
10603  extern void swake_up(struct swait_queue_head *q);
10604  extern void swake_up_all(struct swait_queue_head *q);
10605  extern void swake_up_locked(struct swait_queue_head *q);
10606 +extern void swake_up_all_locked(struct swait_queue_head *q);
10607  
10608  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
10609  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
10610 diff --git a/include/linux/swap.h b/include/linux/swap.h
10611 index 55ff5593c193..52bf5477dc92 100644
10612 --- a/include/linux/swap.h
10613 +++ b/include/linux/swap.h
10614 @@ -11,6 +11,7 @@
10615  #include <linux/fs.h>
10616  #include <linux/atomic.h>
10617  #include <linux/page-flags.h>
10618 +#include <linux/locallock.h>
10619  #include <asm/page.h>
10620  
10621  struct notifier_block;
10622 @@ -247,7 +248,8 @@ struct swap_info_struct {
10623  void *workingset_eviction(struct address_space *mapping, struct page *page);
10624  bool workingset_refault(void *shadow);
10625  void workingset_activation(struct page *page);
10626 -extern struct list_lru workingset_shadow_nodes;
10627 +extern struct list_lru __workingset_shadow_nodes;
10628 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
10629  
10630  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10631  {
10632 @@ -292,6 +294,7 @@ extern unsigned long nr_free_pagecache_pages(void);
10633  
10634  
10635  /* linux/mm/swap.c */
10636 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10637  extern void lru_cache_add(struct page *);
10638  extern void lru_cache_add_anon(struct page *page);
10639  extern void lru_cache_add_file(struct page *page);
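
Usage sketch (not part of the patch): DECLARE_LOCAL_IRQ_LOCK() publishes per-CPU local locks (swapvec_lock, workingset_shadow_lock) whose definitions live in mm/. Callers use the locallock API this series adds elsewhere; on !RT local_lock_irqsave() falls back to local_irq_save(), on RT it takes a per-CPU sleeping lock and pins the task to its CPU. The function below only illustrates the pattern.

#include <linux/locallock.h>
#include <linux/swap.h>

static void demo_touch_lru_pvecs(void)
{
        unsigned long flags;

        local_lock_irqsave(swapvec_lock, flags);
        /* ... operate on this CPU's LRU pagevecs, as mm/swap.c does ... */
        local_unlock_irqsave(swapvec_lock, flags);
}
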
10640 diff --git a/include/linux/swork.h b/include/linux/swork.h
10641 new file mode 100644
10642 index 000000000000..f175fa9a6016
10643 --- /dev/null
10644 +++ b/include/linux/swork.h
10645 @@ -0,0 +1,24 @@
10646 +#ifndef _LINUX_SWORK_H
10647 +#define _LINUX_SWORK_H
10648 +
10649 +#include <linux/list.h>
10650 +
10651 +struct swork_event {
10652 +       struct list_head item;
10653 +       unsigned long flags;
10654 +       void (*func)(struct swork_event *);
10655 +};
10656 +
10657 +static inline void INIT_SWORK(struct swork_event *event,
10658 +                             void (*func)(struct swork_event *))
10659 +{
10660 +       event->flags = 0;
10661 +       event->func = func;
10662 +}
10663 +
10664 +bool swork_queue(struct swork_event *sev);
10665 +
10666 +int swork_get(void);
10667 +void swork_put(void);
10668 +
10669 +#endif /* _LINUX_SWORK_H */
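
Usage sketch (not part of the patch): the swork ("simple work") interface defers a callback to a kernel thread that runs it in sleepable process context; swork_get()/swork_put() reference-count that worker thread. The embedding structure and handler below are invented for illustration.

#include <linux/kernel.h>
#include <linux/swork.h>

struct demo_unit {                      /* hypothetical */
        struct swork_event ev;
        int pending;
};

static void demo_handler(struct swork_event *sev)
{
        struct demo_unit *u = container_of(sev, struct demo_unit, ev);

        u->pending = 0;                 /* runs in process context, may sleep */
}

static int demo_init(struct demo_unit *u)
{
        int ret = swork_get();          /* ensure the worker thread exists */

        if (ret)
                return ret;

        INIT_SWORK(&u->ev, demo_handler);
        return 0;
}

static void demo_trigger(struct demo_unit *u)
{
        u->pending = 1;
        swork_queue(&u->ev);            /* hand the event to the worker */
}

static void demo_exit(void)
{
        swork_put();                    /* drop the worker reference */
}
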
10670 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
10671 index 2873baf5372a..eb1a108f17ca 100644
10672 --- a/include/linux/thread_info.h
10673 +++ b/include/linux/thread_info.h
10674 @@ -107,7 +107,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
10675  #define test_thread_flag(flag) \
10676         test_ti_thread_flag(current_thread_info(), flag)
10677  
10678 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10679 +#ifdef CONFIG_PREEMPT_LAZY
10680 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10681 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10682 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10683 +#define tif_need_resched_lazy()        test_thread_flag(TIF_NEED_RESCHED_LAZY)
10684 +
10685 +#else
10686 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10687 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10688 +#define tif_need_resched_lazy()        0
10689 +#endif
10690  
10691  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10692  static inline int arch_within_stack_frames(const void * const stack,
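
Usage sketch (not part of the patch): with CONFIG_PREEMPT_LAZY, tif_need_resched() reports either TIF_NEED_RESCHED or TIF_NEED_RESCHED_LAZY, and the new tif_need_resched_now()/tif_need_resched_lazy() helpers let callers tell an urgent request apart from a lazy one. The loop below is illustrative; demo_unit_of_work() is assumed to be provided elsewhere.

#include <linux/sched.h>
#include <linux/thread_info.h>

extern void demo_unit_of_work(void);    /* hypothetical, provided elsewhere */

static void demo_batch_loop(int nr)
{
        while (nr--) {
                demo_unit_of_work();

                /* Honours both an immediate and a lazily requested
                 * reschedule when PREEMPT_LAZY is enabled. */
                if (tif_need_resched())
                        cond_resched();
        }
}
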
10693 diff --git a/include/linux/timer.h b/include/linux/timer.h
10694 index 51d601f192d4..83cea629efe1 100644
10695 --- a/include/linux/timer.h
10696 +++ b/include/linux/timer.h
10697 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
10698  
10699  extern int try_to_del_timer_sync(struct timer_list *timer);
10700  
10701 -#ifdef CONFIG_SMP
10702 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10703    extern int del_timer_sync(struct timer_list *timer);
10704  #else
10705  # define del_timer_sync(t)             del_timer(t)
10706 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
10707 index ba57266d9e80..5c36934ec2bc 100644
10708 --- a/include/linux/trace_events.h
10709 +++ b/include/linux/trace_events.h
10710 @@ -56,6 +56,9 @@ struct trace_entry {
10711         unsigned char           flags;
10712         unsigned char           preempt_count;
10713         int                     pid;
10714 +       unsigned short          migrate_disable;
10715 +       unsigned short          padding;
10716 +       unsigned char           preempt_lazy_count;
10717  };
10718  
10719  #define TRACE_EVENT_TYPE_MAX                                           \
10720 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
10721 index f30c187ed785..83bf0f798426 100644
10722 --- a/include/linux/uaccess.h
10723 +++ b/include/linux/uaccess.h
10724 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
10725   */
10726  static inline void pagefault_disable(void)
10727  {
10728 +       migrate_disable();
10729         pagefault_disabled_inc();
10730         /*
10731          * make sure to have issued the store before a pagefault
10732 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
10733          */
10734         barrier();
10735         pagefault_disabled_dec();
10736 +       migrate_enable();
10737  }
10738  
10739  /*
10740 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
10741 index 4a29c75b146e..0a294e950df8 100644
10742 --- a/include/linux/uprobes.h
10743 +++ b/include/linux/uprobes.h
10744 @@ -27,6 +27,7 @@
10745  #include <linux/errno.h>
10746  #include <linux/rbtree.h>
10747  #include <linux/types.h>
10748 +#include <linux/wait.h>
10749  
10750  struct vm_area_struct;
10751  struct mm_struct;
10752 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
10753 index 613771909b6e..e28c5a43229d 100644
10754 --- a/include/linux/vmstat.h
10755 +++ b/include/linux/vmstat.h
10756 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
10757   */
10758  static inline void __count_vm_event(enum vm_event_item item)
10759  {
10760 +       preempt_disable_rt();
10761         raw_cpu_inc(vm_event_states.event[item]);
10762 +       preempt_enable_rt();
10763  }
10764  
10765  static inline void count_vm_event(enum vm_event_item item)
10766 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
10767  
10768  static inline void __count_vm_events(enum vm_event_item item, long delta)
10769  {
10770 +       preempt_disable_rt();
10771         raw_cpu_add(vm_event_states.event[item], delta);
10772 +       preempt_enable_rt();
10773  }
10774  
10775  static inline void count_vm_events(enum vm_event_item item, long delta)
10776 diff --git a/include/linux/wait.h b/include/linux/wait.h
10777 index 2408e8d5c05c..db50d6609195 100644
10778 --- a/include/linux/wait.h
10779 +++ b/include/linux/wait.h
10780 @@ -8,6 +8,7 @@
10781  #include <linux/spinlock.h>
10782  #include <asm/current.h>
10783  #include <uapi/linux/wait.h>
10784 +#include <linux/atomic.h>
10785  
10786  typedef struct __wait_queue wait_queue_t;
10787  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10788 diff --git a/include/net/dst.h b/include/net/dst.h
10789 index ddcff17615da..a1fc787b1a8c 100644
10790 --- a/include/net/dst.h
10791 +++ b/include/net/dst.h
10792 @@ -452,7 +452,7 @@ static inline void dst_confirm(struct dst_entry *dst)
10793  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10794                                    struct sk_buff *skb)
10795  {
10796 -       const struct hh_cache *hh;
10797 +       struct hh_cache *hh;
10798  
10799         if (dst->pending_confirm) {
10800                 unsigned long now = jiffies;
10801 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
10802 index 231e121cc7d9..d125222b979d 100644
10803 --- a/include/net/gen_stats.h
10804 +++ b/include/net/gen_stats.h
10805 @@ -5,6 +5,7 @@
10806  #include <linux/socket.h>
10807  #include <linux/rtnetlink.h>
10808  #include <linux/pkt_sched.h>
10809 +#include <net/net_seq_lock.h>
10810  
10811  struct gnet_stats_basic_cpu {
10812         struct gnet_stats_basic_packed bstats;
10813 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
10814                                  spinlock_t *lock, struct gnet_dump *d,
10815                                  int padattr);
10816  
10817 -int gnet_stats_copy_basic(const seqcount_t *running,
10818 +int gnet_stats_copy_basic(net_seqlock_t *running,
10819                           struct gnet_dump *d,
10820                           struct gnet_stats_basic_cpu __percpu *cpu,
10821                           struct gnet_stats_basic_packed *b);
10822 -void __gnet_stats_copy_basic(const seqcount_t *running,
10823 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10824                              struct gnet_stats_basic_packed *bstats,
10825                              struct gnet_stats_basic_cpu __percpu *cpu,
10826                              struct gnet_stats_basic_packed *b);
10827 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
10828                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10829                       struct gnet_stats_rate_est64 *rate_est,
10830                       spinlock_t *stats_lock,
10831 -                     seqcount_t *running, struct nlattr *opt);
10832 +                     net_seqlock_t *running, struct nlattr *opt);
10833  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10834                         struct gnet_stats_rate_est64 *rate_est);
10835  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10836                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10837                           struct gnet_stats_rate_est64 *rate_est,
10838                           spinlock_t *stats_lock,
10839 -                         seqcount_t *running, struct nlattr *opt);
10840 +                         net_seqlock_t *running, struct nlattr *opt);
10841  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10842                           const struct gnet_stats_rate_est64 *rate_est);
10843  #endif
10844 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
10845 index 8b683841e574..bf656008f6e7 100644
10846 --- a/include/net/neighbour.h
10847 +++ b/include/net/neighbour.h
10848 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
10849  }
10850  #endif
10851  
10852 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10853 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10854  {
10855         unsigned int seq;
10856         int hh_len;
10857 @@ -501,7 +501,7 @@ struct neighbour_cb {
10858  
10859  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10860  
10861 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10862 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10863                                      const struct net_device *dev)
10864  {
10865         unsigned int seq;
10866 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
10867 new file mode 100644
10868 index 000000000000..a7034298a82a
10869 --- /dev/null
10870 +++ b/include/net/net_seq_lock.h
10871 @@ -0,0 +1,15 @@
10872 +#ifndef __NET_NET_SEQ_LOCK_H__
10873 +#define __NET_NET_SEQ_LOCK_H__
10874 +
10875 +#ifdef CONFIG_PREEMPT_RT_BASE
10876 +# define net_seqlock_t                 seqlock_t
10877 +# define net_seq_begin(__r)            read_seqbegin(__r)
10878 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10879 +
10880 +#else
10881 +# define net_seqlock_t                 seqcount_t
10882 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10883 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10884 +#endif
10885 +
10886 +#endif
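
Usage sketch (not part of the patch): net_seqlock_t hides whether Qdisc::running is a seqcount (non-RT) or a full seqlock (RT); readers use net_seq_begin()/net_seq_retry() and work unchanged on both configurations. The reader below roughly mirrors what __gnet_stats_copy_basic() does after this conversion and is shown only as an illustration.

#include <net/net_seq_lock.h>
#include <net/sch_generic.h>

static u64 demo_read_qdisc_bytes(struct Qdisc *q)
{
        unsigned int seq;
        u64 bytes;

        do {
                seq = net_seq_begin(&q->running);
                bytes = q->bstats.bytes;
        } while (net_seq_retry(&q->running, seq));

        return bytes;
}
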
10887 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
10888 index 7adf4386ac8f..d3fd5c357268 100644
10889 --- a/include/net/netns/ipv4.h
10890 +++ b/include/net/netns/ipv4.h
10891 @@ -69,6 +69,7 @@ struct netns_ipv4 {
10892  
10893         int sysctl_icmp_echo_ignore_all;
10894         int sysctl_icmp_echo_ignore_broadcasts;
10895 +       int sysctl_icmp_echo_sysrq;
10896         int sysctl_icmp_ignore_bogus_error_responses;
10897         int sysctl_icmp_ratelimit;
10898         int sysctl_icmp_ratemask;
10899 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
10900 index f18fc1a0321f..5d2c9b89c168 100644
10901 --- a/include/net/sch_generic.h
10902 +++ b/include/net/sch_generic.h
10903 @@ -10,6 +10,7 @@
10904  #include <linux/dynamic_queue_limits.h>
10905  #include <net/gen_stats.h>
10906  #include <net/rtnetlink.h>
10907 +#include <net/net_seq_lock.h>
10908  
10909  struct Qdisc_ops;
10910  struct qdisc_walker;
10911 @@ -86,7 +87,7 @@ struct Qdisc {
10912         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10913         struct qdisc_skb_head   q;
10914         struct gnet_stats_basic_packed bstats;
10915 -       seqcount_t              running;
10916 +       net_seqlock_t           running;
10917         struct gnet_stats_queue qstats;
10918         unsigned long           state;
10919         struct Qdisc            *next_sched;
10920 @@ -98,13 +99,22 @@ struct Qdisc {
10921         spinlock_t              busylock ____cacheline_aligned_in_smp;
10922  };
10923  
10924 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10925 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10926  {
10927 +#ifdef CONFIG_PREEMPT_RT_BASE
10928 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10929 +#else
10930         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10931 +#endif
10932  }
10933  
10934  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10935  {
10936 +#ifdef CONFIG_PREEMPT_RT_BASE
10937 +       if (try_write_seqlock(&qdisc->running))
10938 +               return true;
10939 +       return false;
10940 +#else
10941         if (qdisc_is_running(qdisc))
10942                 return false;
10943         /* Variant of write_seqcount_begin() telling lockdep a trylock
10944 @@ -113,11 +123,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10945         raw_write_seqcount_begin(&qdisc->running);
10946         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10947         return true;
10948 +#endif
10949  }
10950  
10951  static inline void qdisc_run_end(struct Qdisc *qdisc)
10952  {
10953 +#ifdef CONFIG_PREEMPT_RT_BASE
10954 +       write_sequnlock(&qdisc->running);
10955 +#else
10956         write_seqcount_end(&qdisc->running);
10957 +#endif
10958  }
10959  
10960  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10961 @@ -308,7 +323,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
10962         return qdisc_lock(root);
10963  }
10964  
10965 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10966 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10967  {
10968         struct Qdisc *root = qdisc_root_sleeping(qdisc);
10969  
10970 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
10971 new file mode 100644
10972 index 000000000000..f7710de1b1f3
10973 --- /dev/null
10974 +++ b/include/trace/events/hist.h
10975 @@ -0,0 +1,73 @@
10976 +#undef TRACE_SYSTEM
10977 +#define TRACE_SYSTEM hist
10978 +
10979 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10980 +#define _TRACE_HIST_H
10981 +
10982 +#include "latency_hist.h"
10983 +#include <linux/tracepoint.h>
10984 +
10985 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10986 +#define trace_preemptirqsoff_hist(a, b)
10987 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10988 +#else
10989 +TRACE_EVENT(preemptirqsoff_hist,
10990 +
10991 +       TP_PROTO(int reason, int starthist),
10992 +
10993 +       TP_ARGS(reason, starthist),
10994 +
10995 +       TP_STRUCT__entry(
10996 +               __field(int,    reason)
10997 +               __field(int,    starthist)
10998 +       ),
10999 +
11000 +       TP_fast_assign(
11001 +               __entry->reason         = reason;
11002 +               __entry->starthist      = starthist;
11003 +       ),
11004 +
11005 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
11006 +                 __entry->starthist ? "start" : "stop")
11007 +);
11008 +#endif
11009 +
11010 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
11011 +#define trace_hrtimer_interrupt(a, b, c, d)
11012 +#else
11013 +TRACE_EVENT(hrtimer_interrupt,
11014 +
11015 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
11016 +               struct task_struct *task),
11017 +
11018 +       TP_ARGS(cpu, offset, curr, task),
11019 +
11020 +       TP_STRUCT__entry(
11021 +               __field(int,            cpu)
11022 +               __field(long long,      offset)
11023 +               __array(char,           ccomm,  TASK_COMM_LEN)
11024 +               __field(int,            cprio)
11025 +               __array(char,           tcomm,  TASK_COMM_LEN)
11026 +               __field(int,            tprio)
11027 +       ),
11028 +
11029 +       TP_fast_assign(
11030 +               __entry->cpu    = cpu;
11031 +               __entry->offset = offset;
11032 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
11033 +               __entry->cprio  = curr->prio;
11034 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
11035 +                       task != NULL ? TASK_COMM_LEN : 7);
11036 +               __entry->tprio  = task != NULL ? task->prio : -1;
11037 +       ),
11038 +
11039 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
11040 +               __entry->cpu, __entry->offset, __entry->ccomm,
11041 +               __entry->cprio, __entry->tcomm, __entry->tprio)
11042 +);
11043 +#endif
11044 +
11045 +#endif /* _TRACE_HIST_H */
11046 +
11047 +/* This part must be outside protection */
11048 +#include <trace/define_trace.h>
11049 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
11050 new file mode 100644
11051 index 000000000000..d3f2fbd560b1
11052 --- /dev/null
11053 +++ b/include/trace/events/latency_hist.h
11054 @@ -0,0 +1,29 @@
11055 +#ifndef _LATENCY_HIST_H
11056 +#define _LATENCY_HIST_H
11057 +
11058 +enum hist_action {
11059 +       IRQS_ON,
11060 +       PREEMPT_ON,
11061 +       TRACE_STOP,
11062 +       IRQS_OFF,
11063 +       PREEMPT_OFF,
11064 +       TRACE_START,
11065 +};
11066 +
11067 +static char *actions[] = {
11068 +       "IRQS_ON",
11069 +       "PREEMPT_ON",
11070 +       "TRACE_STOP",
11071 +       "IRQS_OFF",
11072 +       "PREEMPT_OFF",
11073 +       "TRACE_START",
11074 +};
11075 +
11076 +static inline char *getaction(int action)
11077 +{
11078 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
11079 +               return actions[action];
11080 +       return "unknown";
11081 +}
11082 +
11083 +#endif /* _LATENCY_HIST_H */
11084 diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
11085 index 9b90c57517a9..516ae88cddf4 100644
11086 --- a/include/trace/events/sched.h
11087 +++ b/include/trace/events/sched.h
11088 @@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
11089         TP_fast_assign(
11090                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
11091                 __entry->pid            = p->pid;
11092 -               __entry->prio           = p->prio;
11093 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
11094                 __entry->success        = 1; /* rudiment, kill when possible */
11095                 __entry->target_cpu     = task_cpu(p);
11096         ),
11097 @@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch,
11098                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
11099                 __entry->next_pid       = next->pid;
11100                 __entry->next_prio      = next->prio;
11101 +               /* XXX SCHED_DEADLINE */
11102         ),
11103  
11104         TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
11105 @@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task,
11106         TP_fast_assign(
11107                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
11108                 __entry->pid            = p->pid;
11109 -               __entry->prio           = p->prio;
11110 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
11111                 __entry->orig_cpu       = task_cpu(p);
11112                 __entry->dest_cpu       = dest_cpu;
11113         ),
11114 @@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
11115         TP_fast_assign(
11116                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
11117                 __entry->pid            = p->pid;
11118 -               __entry->prio           = p->prio;
11119 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
11120         ),
11121  
11122         TP_printk("comm=%s pid=%d prio=%d",
11123 @@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait,
11124         TP_fast_assign(
11125                 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
11126                 __entry->pid            = pid_nr(pid);
11127 -               __entry->prio           = current->prio;
11128 +               __entry->prio           = current->prio; /* XXX SCHED_DEADLINE */
11129         ),
11130  
11131         TP_printk("comm=%s pid=%d prio=%d",
11132 @@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
11133   */
11134  TRACE_EVENT(sched_pi_setprio,
11135  
11136 -       TP_PROTO(struct task_struct *tsk, int newprio),
11137 +       TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
11138  
11139 -       TP_ARGS(tsk, newprio),
11140 +       TP_ARGS(tsk, pi_task),
11141  
11142         TP_STRUCT__entry(
11143                 __array( char,  comm,   TASK_COMM_LEN   )
11144 @@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio,
11145                 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
11146                 __entry->pid            = tsk->pid;
11147                 __entry->oldprio        = tsk->prio;
11148 -               __entry->newprio        = newprio;
11149 +               __entry->newprio        = pi_task ? pi_task->prio : tsk->prio;
11150 +               /* XXX SCHED_DEADLINE bits missing */
11151         ),
11152  
11153         TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
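
With this change the tracepoint derives newprio from the boosting task (pi_task) itself rather than from a caller-computed priority. A hedged sketch of the caller side; the function below is illustrative, the real caller is the rt_mutex priority-adjustment path:

    /* Illustrative caller: pass the donor task, or NULL when deboosting
     * back to the task's normal priority. */
    static void example_pi_adjust(struct task_struct *p,
                                  struct task_struct *pi_task)
    {
            trace_sched_pi_setprio(p, pi_task);
    }
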
11154 diff --git a/init/Kconfig b/init/Kconfig
11155 index 34407f15e6d3..2ce33a32e65d 100644
11156 --- a/init/Kconfig
11157 +++ b/init/Kconfig
11158 @@ -506,7 +506,7 @@ config TINY_RCU
11159  
11160  config RCU_EXPERT
11161         bool "Make expert-level adjustments to RCU configuration"
11162 -       default n
11163 +       default y if PREEMPT_RT_FULL
11164         help
11165           This option needs to be enabled if you wish to make
11166           expert-level adjustments to RCU configuration.  By default,
11167 @@ -623,7 +623,7 @@ config RCU_FANOUT_LEAF
11168  
11169  config RCU_FAST_NO_HZ
11170         bool "Accelerate last non-dyntick-idle CPU's grace periods"
11171 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
11172 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
11173         default n
11174         help
11175           This option permits CPUs to enter dynticks-idle state even if
11176 @@ -650,7 +650,7 @@ config TREE_RCU_TRACE
11177  config RCU_BOOST
11178         bool "Enable RCU priority boosting"
11179         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
11180 -       default n
11181 +       default y if PREEMPT_RT_FULL
11182         help
11183           This option boosts the priority of preempted RCU readers that
11184           block the current preemptible RCU grace period for too long.
11185 @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL
11186  
11187  endchoice
11188  
11189 -config RCU_EXPEDITE_BOOT
11190 -       bool
11191 -       default n
11192 -       help
11193 -         This option enables expedited grace periods at boot time,
11194 -         as if rcu_expedite_gp() had been invoked early in boot.
11195 -         The corresponding rcu_unexpedite_gp() is invoked from
11196 -         rcu_end_inkernel_boot(), which is intended to be invoked
11197 -         at the end of the kernel-only boot sequence, just before
11198 -         init is exec'ed.
11199 -
11200 -         Accept the default if unsure.
11201 -
11202  endmenu # "RCU Subsystem"
11203  
11204  config BUILD_BIN2C
11205 @@ -1064,6 +1051,7 @@ config CFS_BANDWIDTH
11206  config RT_GROUP_SCHED
11207         bool "Group scheduling for SCHED_RR/FIFO"
11208         depends on CGROUP_SCHED
11209 +       depends on !PREEMPT_RT_FULL
11210         default n
11211         help
11212           This feature lets you explicitly allocate real CPU bandwidth
11213 @@ -1772,6 +1760,7 @@ choice
11214  
11215  config SLAB
11216         bool "SLAB"
11217 +       depends on !PREEMPT_RT_FULL
11218         select HAVE_HARDENED_USERCOPY_ALLOCATOR
11219         help
11220           The regular slab allocator that is established and known to work
11221 @@ -1792,6 +1781,7 @@ config SLUB
11222  config SLOB
11223         depends on EXPERT
11224         bool "SLOB (Simple Allocator)"
11225 +       depends on !PREEMPT_RT_FULL
11226         help
11227            SLOB replaces the stock allocator with a drastically simpler
11228            allocator. SLOB is generally more space efficient but
11229 @@ -1810,7 +1800,7 @@ config SLAB_FREELIST_RANDOM
11230  
11231  config SLUB_CPU_PARTIAL
11232         default y
11233 -       depends on SLUB && SMP
11234 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
11235         bool "SLUB per cpu partial cache"
11236         help
11237           Per cpu partial caches accelerate object allocation and freeing
11238 diff --git a/init/Makefile b/init/Makefile
11239 index c4fb45525d08..821190dfaa75 100644
11240 --- a/init/Makefile
11241 +++ b/init/Makefile
11242 @@ -35,4 +35,4 @@ silent_chk_compile.h = :
11243  include/generated/compile.h: FORCE
11244         @$($(quiet)chk_compile.h)
11245         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
11246 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
11247 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
11248 diff --git a/init/main.c b/init/main.c
11249 index 25bac88bc66e..a4a61e7d2248 100644
11250 --- a/init/main.c
11251 +++ b/init/main.c
11252 @@ -506,6 +506,7 @@ asmlinkage __visible void __init start_kernel(void)
11253         setup_command_line(command_line);
11254         setup_nr_cpu_ids();
11255         setup_per_cpu_areas();
11256 +       softirq_early_init();
11257         boot_cpu_state_init();
11258         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
11259  
11260 diff --git a/ipc/sem.c b/ipc/sem.c
11261 index 10b94bc59d4a..b8360eaacc7a 100644
11262 --- a/ipc/sem.c
11263 +++ b/ipc/sem.c
11264 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
11265  static void wake_up_sem_queue_prepare(struct list_head *pt,
11266                                 struct sem_queue *q, int error)
11267  {
11268 +#ifdef CONFIG_PREEMPT_RT_BASE
11269 +       struct task_struct *p = q->sleeper;
11270 +       get_task_struct(p);
11271 +       q->status = error;
11272 +       wake_up_process(p);
11273 +       put_task_struct(p);
11274 +#else
11275         if (list_empty(pt)) {
11276                 /*
11277                  * Hold preempt off so that we don't get preempted and have the
11278 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
11279         q->pid = error;
11280  
11281         list_add_tail(&q->list, pt);
11282 +#endif
11283  }
11284  
11285  /**
11286 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
11287   */
11288  static void wake_up_sem_queue_do(struct list_head *pt)
11289  {
11290 +#ifndef CONFIG_PREEMPT_RT_BASE
11291         struct sem_queue *q, *t;
11292         int did_something;
11293  
11294 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
11295         }
11296         if (did_something)
11297                 preempt_enable();
11298 +#endif
11299  }
11300  
11301  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
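
On PREEMPT_RT_BASE the wakeup happens immediately under a task reference instead of being batched onto the pending list, because the batched flush relies on keeping preemption disabled across the whole loop. A hedged sketch of the two shapes side by side (simplified; error handling and the q->pid bookkeeping are omitted):

    /* Sketch: RT path - wake the sleeper directly, pinned by a reference. */
    static void rt_style_wake(struct sem_queue *q, int error)
    {
            struct task_struct *p = q->sleeper;

            get_task_struct(p);
            q->status = error;
            wake_up_process(p);
            put_task_struct(p);
    }

    /* Sketch: non-RT path - defer onto the list for a batched wakeup. */
    static void batched_style_wake(struct list_head *pt, struct sem_queue *q)
    {
            list_add_tail(&q->list, pt);
    }
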
11302 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
11303 index ebdb0043203a..b9e6aa7e5aa6 100644
11304 --- a/kernel/Kconfig.locks
11305 +++ b/kernel/Kconfig.locks
11306 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
11307  
11308  config MUTEX_SPIN_ON_OWNER
11309         def_bool y
11310 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
11311 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11312  
11313  config RWSEM_SPIN_ON_OWNER
11314         def_bool y
11315 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
11316 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11317  
11318  config LOCK_SPIN_ON_OWNER
11319         def_bool y
11320 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
11321 index 3f9c97419f02..11dbe26a8279 100644
11322 --- a/kernel/Kconfig.preempt
11323 +++ b/kernel/Kconfig.preempt
11324 @@ -1,3 +1,16 @@
11325 +config PREEMPT
11326 +       bool
11327 +       select PREEMPT_COUNT
11328 +
11329 +config PREEMPT_RT_BASE
11330 +       bool
11331 +       select PREEMPT
11332 +
11333 +config HAVE_PREEMPT_LAZY
11334 +       bool
11335 +
11336 +config PREEMPT_LAZY
11337 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
11338  
11339  choice
11340         prompt "Preemption Model"
11341 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
11342  
11343           Select this if you are building a kernel for a desktop system.
11344  
11345 -config PREEMPT
11346 +config PREEMPT__LL
11347         bool "Preemptible Kernel (Low-Latency Desktop)"
11348 -       select PREEMPT_COUNT
11349 +       select PREEMPT
11350         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
11351         help
11352           This option reduces the latency of the kernel by making
11353 @@ -52,6 +65,22 @@ config PREEMPT
11354           embedded system with latency requirements in the milliseconds
11355           range.
11356  
11357 +config PREEMPT_RTB
11358 +       bool "Preemptible Kernel (Basic RT)"
11359 +       select PREEMPT_RT_BASE
11360 +       help
11361 +         This option is basically the same as (Low-Latency Desktop) but
11362 +         enables changes which are preliminary for the full preemptible
11363 +         RT kernel.
11364 +
11365 +config PREEMPT_RT_FULL
11366 +       bool "Fully Preemptible Kernel (RT)"
11367 +       depends on IRQ_FORCED_THREADING
11368 +       select PREEMPT_RT_BASE
11369 +       select PREEMPT_RCU
11370 +       help
11371 +         All and everything
11372 +
11373  endchoice
11374  
11375  config PREEMPT_COUNT
11376 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
11377 index 4c233437ee1a..6c3c9f298f22 100644
11378 --- a/kernel/cgroup.c
11379 +++ b/kernel/cgroup.c
11380 @@ -5041,10 +5041,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
11381         queue_work(cgroup_destroy_wq, &css->destroy_work);
11382  }
11383  
11384 -static void css_release_work_fn(struct work_struct *work)
11385 +static void css_release_work_fn(struct swork_event *sev)
11386  {
11387         struct cgroup_subsys_state *css =
11388 -               container_of(work, struct cgroup_subsys_state, destroy_work);
11389 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
11390         struct cgroup_subsys *ss = css->ss;
11391         struct cgroup *cgrp = css->cgroup;
11392  
11393 @@ -5087,8 +5087,8 @@ static void css_release(struct percpu_ref *ref)
11394         struct cgroup_subsys_state *css =
11395                 container_of(ref, struct cgroup_subsys_state, refcnt);
11396  
11397 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
11398 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
11399 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
11400 +       swork_queue(&css->destroy_swork);
11401  }
11402  
11403  static void init_and_link_css(struct cgroup_subsys_state *css,
11404 @@ -5749,6 +5749,7 @@ static int __init cgroup_wq_init(void)
11405          */
11406         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
11407         BUG_ON(!cgroup_destroy_wq);
11408 +       BUG_ON(swork_get());
11409  
11410         /*
11411          * Used to destroy pidlists and separate to serve as flush domain.
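
css_release() can fire from contexts where queueing a regular work item is not allowed on RT, so the release is funneled through the simple-work (swork) infrastructure added elsewhere in this patch. A hedged sketch of the general pattern; the struct, handler, and <linux/swork.h> include below are illustrative assumptions, not the cgroup code itself:

    #include <linux/swork.h>   /* assumed header providing the swork API */

    struct example_obj {
            struct swork_event release_swork;
    };

    static void example_release_fn(struct swork_event *sev)
    {
            struct example_obj *obj =
                    container_of(sev, struct example_obj, release_swork);

            /* ... final teardown of @obj runs here, in the swork kthread ... */
    }

    static void example_release(struct example_obj *obj)
    {
            INIT_SWORK(&obj->release_swork, example_release_fn);
            swork_queue(&obj->release_swork);
    }
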
11412 diff --git a/kernel/cpu.c b/kernel/cpu.c
11413 index 26a4f74bff83..010db3c943cd 100644
11414 --- a/kernel/cpu.c
11415 +++ b/kernel/cpu.c
11416 @@ -239,6 +239,289 @@ static struct {
11417  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
11418  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
11419  
11420 +/**
11421 + * hotplug_pcp - per cpu hotplug descriptor
11422 + * @unplug:    set when pin_current_cpu() needs to sync tasks
11423 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
11424 + * @refcount:  counter of tasks in pinned sections
11425 + * @grab_lock: set when the tasks entering pinned sections should wait
11426 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
11427 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
11428 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
11429 + *
11430 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
11431 + * is used as a flag and still exists after @sync_tsk has exited and
11432 + * @sync_tsk set to NULL.
11433 + */
11434 +struct hotplug_pcp {
11435 +       struct task_struct *unplug;
11436 +       struct task_struct *sync_tsk;
11437 +       int refcount;
11438 +       int grab_lock;
11439 +       struct completion synced;
11440 +       struct completion unplug_wait;
11441 +#ifdef CONFIG_PREEMPT_RT_FULL
11442 +       /*
11443 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
11444 +        * the task, otherwise the mutex will cause the task to fail
11445 +        * to sleep when required. (Because it's called from migrate_disable())
11446 +        *
11447 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
11448 +        * state.
11449 +        */
11450 +       spinlock_t lock;
11451 +#else
11452 +       struct mutex mutex;
11453 +#endif
11454 +       int mutex_init;
11455 +};
11456 +
11457 +#ifdef CONFIG_PREEMPT_RT_FULL
11458 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
11459 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
11460 +#else
11461 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
11462 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
11463 +#endif
11464 +
11465 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
11466 +
11467 +/**
11468 + * pin_current_cpu - Prevent the current cpu from being unplugged
11469 + *
11470 + * Lightweight version of get_online_cpus() to prevent cpu from being
11471 + * unplugged when code runs in a migration disabled region.
11472 + *
11473 + * Must be called with preemption disabled (preempt_count = 1)!
11474 + */
11475 +void pin_current_cpu(void)
11476 +{
11477 +       struct hotplug_pcp *hp;
11478 +       int force = 0;
11479 +
11480 +retry:
11481 +       hp = this_cpu_ptr(&hotplug_pcp);
11482 +
11483 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
11484 +           hp->unplug == current) {
11485 +               hp->refcount++;
11486 +               return;
11487 +       }
11488 +       if (hp->grab_lock) {
11489 +               preempt_enable();
11490 +               hotplug_lock(hp);
11491 +               hotplug_unlock(hp);
11492 +       } else {
11493 +               preempt_enable();
11494 +               /*
11495 +                * Try to push this task off of this CPU.
11496 +                */
11497 +               if (!migrate_me()) {
11498 +                       preempt_disable();
11499 +                       hp = this_cpu_ptr(&hotplug_pcp);
11500 +                       if (!hp->grab_lock) {
11501 +                               /*
11502 +                                * Just let it continue; it's already pinned
11503 +                                * or about to sleep.
11504 +                                */
11505 +                               force = 1;
11506 +                               goto retry;
11507 +                       }
11508 +                       preempt_enable();
11509 +               }
11510 +       }
11511 +       preempt_disable();
11512 +       goto retry;
11513 +}
11514 +
11515 +/**
11516 + * unpin_current_cpu - Allow unplug of current cpu
11517 + *
11518 + * Must be called with preemption or interrupts disabled!
11519 + */
11520 +void unpin_current_cpu(void)
11521 +{
11522 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
11523 +
11524 +       WARN_ON(hp->refcount <= 0);
11525 +
11526 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
11527 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
11528 +               wake_up_process(hp->unplug);
11529 +}
11530 +
11531 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
11532 +{
11533 +       set_current_state(TASK_UNINTERRUPTIBLE);
11534 +       while (hp->refcount) {
11535 +               schedule_preempt_disabled();
11536 +               set_current_state(TASK_UNINTERRUPTIBLE);
11537 +       }
11538 +}
11539 +
11540 +static int sync_unplug_thread(void *data)
11541 +{
11542 +       struct hotplug_pcp *hp = data;
11543 +
11544 +       wait_for_completion(&hp->unplug_wait);
11545 +       preempt_disable();
11546 +       hp->unplug = current;
11547 +       wait_for_pinned_cpus(hp);
11548 +
11549 +       /*
11550 +        * This thread will synchronize the cpu_down() with threads
11551 +        * that have pinned the CPU. When the pinned CPU count reaches
11552 +        * zero, we inform the cpu_down code to continue to the next step.
11553 +        */
11554 +       set_current_state(TASK_UNINTERRUPTIBLE);
11555 +       preempt_enable();
11556 +       complete(&hp->synced);
11557 +
11558 +       /*
11559 +        * If all succeeds, the next step will need tasks to wait till
11560 +        * the CPU is offline before continuing. To do this, the grab_lock
11561 +        * is set and tasks going into pin_current_cpu() will block on the
11562 +        * mutex. But we still need to wait for those that are already in
11563 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
11564 +        * will kick this thread out.
11565 +        */
11566 +       while (!hp->grab_lock && !kthread_should_stop()) {
11567 +               schedule();
11568 +               set_current_state(TASK_UNINTERRUPTIBLE);
11569 +       }
11570 +
11571 +       /* Make sure grab_lock is seen before we see a stale completion */
11572 +       smp_mb();
11573 +
11574 +       /*
11575 +        * Now just before cpu_down() enters stop machine, we need to make
11576 +        * sure all tasks that are in pinned CPU sections are out, and new
11577 +        * tasks will now grab the lock, keeping them from entering pinned
11578 +        * CPU sections.
11579 +        */
11580 +       if (!kthread_should_stop()) {
11581 +               preempt_disable();
11582 +               wait_for_pinned_cpus(hp);
11583 +               preempt_enable();
11584 +               complete(&hp->synced);
11585 +       }
11586 +
11587 +       set_current_state(TASK_UNINTERRUPTIBLE);
11588 +       while (!kthread_should_stop()) {
11589 +               schedule();
11590 +               set_current_state(TASK_UNINTERRUPTIBLE);
11591 +       }
11592 +       set_current_state(TASK_RUNNING);
11593 +
11594 +       /*
11595 +        * Force this thread off this CPU as it's going down and
11596 +        * we don't want any more work on this CPU.
11597 +        */
11598 +       current->flags &= ~PF_NO_SETAFFINITY;
11599 +       set_cpus_allowed_ptr(current, cpu_present_mask);
11600 +       migrate_me();
11601 +       return 0;
11602 +}
11603 +
11604 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
11605 +{
11606 +       wake_up_process(hp->sync_tsk);
11607 +       wait_for_completion(&hp->synced);
11608 +}
11609 +
11610 +static void __cpu_unplug_wait(unsigned int cpu)
11611 +{
11612 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11613 +
11614 +       complete(&hp->unplug_wait);
11615 +       wait_for_completion(&hp->synced);
11616 +}
11617 +
11618 +/*
11619 + * Start the sync_unplug_thread on the target cpu and wait for it to
11620 + * complete.
11621 + */
11622 +static int cpu_unplug_begin(unsigned int cpu)
11623 +{
11624 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11625 +       int err;
11626 +
11627 +       /* Protected by cpu_hotplug.lock */
11628 +       if (!hp->mutex_init) {
11629 +#ifdef CONFIG_PREEMPT_RT_FULL
11630 +               spin_lock_init(&hp->lock);
11631 +#else
11632 +               mutex_init(&hp->mutex);
11633 +#endif
11634 +               hp->mutex_init = 1;
11635 +       }
11636 +
11637 +       /* Inform the scheduler to migrate tasks off this CPU */
11638 +       tell_sched_cpu_down_begin(cpu);
11639 +
11640 +       init_completion(&hp->synced);
11641 +       init_completion(&hp->unplug_wait);
11642 +
11643 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
11644 +       if (IS_ERR(hp->sync_tsk)) {
11645 +               err = PTR_ERR(hp->sync_tsk);
11646 +               hp->sync_tsk = NULL;
11647 +               return err;
11648 +       }
11649 +       kthread_bind(hp->sync_tsk, cpu);
11650 +
11651 +       /*
11652 +        * Wait for tasks to get out of the pinned sections,
11653 +        * it's still OK if new tasks enter. Some CPU notifiers will
11654 +        * wait for tasks that are going to enter these sections and
11655 +        * we must not have them block.
11656 +        */
11657 +       wake_up_process(hp->sync_tsk);
11658 +       return 0;
11659 +}
11660 +
11661 +static void cpu_unplug_sync(unsigned int cpu)
11662 +{
11663 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11664 +
11665 +       init_completion(&hp->synced);
11666 +       /* The completion needs to be initialized before setting grab_lock */
11667 +       smp_wmb();
11668 +
11669 +       /* Grab the mutex before setting grab_lock */
11670 +       hotplug_lock(hp);
11671 +       hp->grab_lock = 1;
11672 +
11673 +       /*
11674 +        * The CPU notifiers have been completed.
11675 +        * Wait for tasks to get out of pinned CPU sections and have new
11676 +        * tasks block until the CPU is completely down.
11677 +        */
11678 +       __cpu_unplug_sync(hp);
11679 +
11680 +       /* All done with the sync thread */
11681 +       kthread_stop(hp->sync_tsk);
11682 +       hp->sync_tsk = NULL;
11683 +}
11684 +
11685 +static void cpu_unplug_done(unsigned int cpu)
11686 +{
11687 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11688 +
11689 +       hp->unplug = NULL;
11690 +       /* Let all tasks know cpu unplug is finished before cleaning up */
11691 +       smp_wmb();
11692 +
11693 +       if (hp->sync_tsk)
11694 +               kthread_stop(hp->sync_tsk);
11695 +
11696 +       if (hp->grab_lock) {
11697 +               hotplug_unlock(hp);
11698 +               /* protected by cpu_hotplug.lock */
11699 +               hp->grab_lock = 0;
11700 +       }
11701 +       tell_sched_cpu_down_done(cpu);
11702 +}
11703  
11704  void get_online_cpus(void)
11705  {
11706 @@ -802,10 +1085,14 @@ static int takedown_cpu(unsigned int cpu)
11707         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11708         int err;
11709  
11710 +       __cpu_unplug_wait(cpu);
11711         /* Park the smpboot threads */
11712         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
11713         smpboot_park_threads(cpu);
11714  
11715 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
11716 +       cpu_unplug_sync(cpu);
11717 +
11718         /*
11719          * Prevent irq alloc/free while the dying cpu reorganizes the
11720          * interrupt affinities.
11721 @@ -890,6 +1177,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11722         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11723         int prev_state, ret = 0;
11724         bool hasdied = false;
11725 +       int mycpu;
11726 +       cpumask_var_t cpumask;
11727 +       cpumask_var_t cpumask_org;
11728  
11729         if (num_online_cpus() == 1)
11730                 return -EBUSY;
11731 @@ -897,7 +1187,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11732         if (!cpu_present(cpu))
11733                 return -EINVAL;
11734  
11735 +       /* Move the downtaker off the unplug cpu */
11736 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11737 +               return -ENOMEM;
11738 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11739 +               free_cpumask_var(cpumask);
11740 +               return -ENOMEM;
11741 +       }
11742 +
11743 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11744 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11745 +       set_cpus_allowed_ptr(current, cpumask);
11746 +       free_cpumask_var(cpumask);
11747 +       migrate_disable();
11748 +       mycpu = smp_processor_id();
11749 +       if (mycpu == cpu) {
11750 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11751 +               migrate_enable();
11752 +               ret = -EBUSY;
11753 +               goto restore_cpus;
11754 +       }
11755 +
11756 +       migrate_enable();
11757         cpu_hotplug_begin();
11758 +       ret = cpu_unplug_begin(cpu);
11759 +       if (ret) {
11760 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11761 +               goto out_cancel;
11762 +       }
11763  
11764         cpuhp_tasks_frozen = tasks_frozen;
11765  
11766 @@ -936,10 +1253,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11767  
11768         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11769  out:
11770 +       cpu_unplug_done(cpu);
11771 +out_cancel:
11772         cpu_hotplug_done();
11773         /* This post dead nonsense must die */
11774         if (!ret && hasdied)
11775                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11776 +restore_cpus:
11777 +       set_cpus_allowed_ptr(current, cpumask_org);
11778 +       free_cpumask_var(cpumask_org);
11779         return ret;
11780  }
11781  
11782 @@ -1242,6 +1564,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
11783  
11784  #endif /* CONFIG_PM_SLEEP_SMP */
11785  
11786 +int __boot_cpu_id;
11787 +
11788  #endif /* CONFIG_SMP */
11789  
11790  /* Boot processor state steps */
11791 @@ -1926,6 +2250,10 @@ void __init boot_cpu_init(void)
11792         set_cpu_active(cpu, true);
11793         set_cpu_present(cpu, true);
11794         set_cpu_possible(cpu, true);
11795 +
11796 +#ifdef CONFIG_SMP
11797 +       __boot_cpu_id = cpu;
11798 +#endif
11799  }
11800  
11801  /*
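
pin_current_cpu()/unpin_current_cpu() are the lightweight hooks that migrate_disable()/migrate_enable() use to keep the running CPU from being unplugged underneath a migrate-disabled region. A hedged usage sketch; the bracketing is normally done by migrate_disable() itself, the explicit calls below only show the pairing:

    /* Illustrative pairing; pin_current_cpu() must be called with
     * preemption disabled, as documented above. */
    static void example_pinned_section(void)
    {
            preempt_disable();
            pin_current_cpu();
            /* ... per-CPU work that must not race with cpu_down() ... */
            unpin_current_cpu();
            preempt_enable();
    }
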
11802 diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
11803 index 009cc9a17d95..67b02e138a47 100644
11804 --- a/kernel/cpu_pm.c
11805 +++ b/kernel/cpu_pm.c
11806 @@ -22,15 +22,21 @@
11807  #include <linux/spinlock.h>
11808  #include <linux/syscore_ops.h>
11809  
11810 -static DEFINE_RWLOCK(cpu_pm_notifier_lock);
11811 -static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
11812 +static ATOMIC_NOTIFIER_HEAD(cpu_pm_notifier_chain);
11813  
11814  static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
11815  {
11816         int ret;
11817  
11818 -       ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
11819 +       /*
11820 +        * __atomic_notifier_call_chain has an RCU read-side critical section,
11821 +        * which could be dysfunctional in CPU idle. Copy RCU_NONIDLE code to let
11822 +        * RCU know this.
11823 +        */
11824 +       rcu_irq_enter_irqson();
11825 +       ret = __atomic_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
11826                 nr_to_call, nr_calls);
11827 +       rcu_irq_exit_irqson();
11828  
11829         return notifier_to_errno(ret);
11830  }
11831 @@ -47,14 +53,7 @@ static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
11832   */
11833  int cpu_pm_register_notifier(struct notifier_block *nb)
11834  {
11835 -       unsigned long flags;
11836 -       int ret;
11837 -
11838 -       write_lock_irqsave(&cpu_pm_notifier_lock, flags);
11839 -       ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
11840 -       write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
11841 -
11842 -       return ret;
11843 +       return atomic_notifier_chain_register(&cpu_pm_notifier_chain, nb);
11844  }
11845  EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
11846  
11847 @@ -69,14 +68,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
11848   */
11849  int cpu_pm_unregister_notifier(struct notifier_block *nb)
11850  {
11851 -       unsigned long flags;
11852 -       int ret;
11853 -
11854 -       write_lock_irqsave(&cpu_pm_notifier_lock, flags);
11855 -       ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
11856 -       write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
11857 -
11858 -       return ret;
11859 +       return atomic_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
11860  }
11861  EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
11862  
11863 @@ -100,7 +92,6 @@ int cpu_pm_enter(void)
11864         int nr_calls;
11865         int ret = 0;
11866  
11867 -       read_lock(&cpu_pm_notifier_lock);
11868         ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
11869         if (ret)
11870                 /*
11871 @@ -108,7 +99,6 @@ int cpu_pm_enter(void)
11872                  * PM entry who are notified earlier to prepare for it.
11873                  */
11874                 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
11875 -       read_unlock(&cpu_pm_notifier_lock);
11876  
11877         return ret;
11878  }
11879 @@ -128,13 +118,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_enter);
11880   */
11881  int cpu_pm_exit(void)
11882  {
11883 -       int ret;
11884 -
11885 -       read_lock(&cpu_pm_notifier_lock);
11886 -       ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
11887 -       read_unlock(&cpu_pm_notifier_lock);
11888 -
11889 -       return ret;
11890 +       return cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
11891  }
11892  EXPORT_SYMBOL_GPL(cpu_pm_exit);
11893  
11894 @@ -159,7 +143,6 @@ int cpu_cluster_pm_enter(void)
11895         int nr_calls;
11896         int ret = 0;
11897  
11898 -       read_lock(&cpu_pm_notifier_lock);
11899         ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
11900         if (ret)
11901                 /*
11902 @@ -167,7 +150,6 @@ int cpu_cluster_pm_enter(void)
11903                  * PM entry who are notified earlier to prepare for it.
11904                  */
11905                 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
11906 -       read_unlock(&cpu_pm_notifier_lock);
11907  
11908         return ret;
11909  }
11910 @@ -190,13 +172,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
11911   */
11912  int cpu_cluster_pm_exit(void)
11913  {
11914 -       int ret;
11915 -
11916 -       read_lock(&cpu_pm_notifier_lock);
11917 -       ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
11918 -       read_unlock(&cpu_pm_notifier_lock);
11919 -
11920 -       return ret;
11921 +       return cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
11922  }
11923  EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
11924  
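
Switching the chain to an atomic notifier removes the sleeping rwlock from the CPU-PM entry/exit paths, which run with interrupts disabled on RT. Registration is unchanged for users; a hedged sketch of a typical client (the callback body is illustrative):

    #include <linux/cpu_pm.h>
    #include <linux/notifier.h>

    static int example_cpu_pm_cb(struct notifier_block *nb,
                                 unsigned long action, void *data)
    {
            switch (action) {
            case CPU_PM_ENTER:
                    /* save per-CPU hardware state */
                    break;
            case CPU_PM_EXIT:
                    /* restore per-CPU hardware state */
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block example_cpu_pm_nb = {
            .notifier_call = example_cpu_pm_cb,
    };

    static int __init example_init(void)
    {
            return cpu_pm_register_notifier(&example_cpu_pm_nb);
    }
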
11925 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
11926 index 511b1dd8ff09..1dd63833ecdc 100644
11927 --- a/kernel/cpuset.c
11928 +++ b/kernel/cpuset.c
11929 @@ -285,7 +285,7 @@ static struct cpuset top_cpuset = {
11930   */
11931  
11932  static DEFINE_MUTEX(cpuset_mutex);
11933 -static DEFINE_SPINLOCK(callback_lock);
11934 +static DEFINE_RAW_SPINLOCK(callback_lock);
11935  
11936  static struct workqueue_struct *cpuset_migrate_mm_wq;
11937  
11938 @@ -908,9 +908,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
11939                         continue;
11940                 rcu_read_unlock();
11941  
11942 -               spin_lock_irq(&callback_lock);
11943 +               raw_spin_lock_irq(&callback_lock);
11944                 cpumask_copy(cp->effective_cpus, new_cpus);
11945 -               spin_unlock_irq(&callback_lock);
11946 +               raw_spin_unlock_irq(&callback_lock);
11947  
11948                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11949                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
11950 @@ -975,9 +975,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
11951         if (retval < 0)
11952                 return retval;
11953  
11954 -       spin_lock_irq(&callback_lock);
11955 +       raw_spin_lock_irq(&callback_lock);
11956         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
11957 -       spin_unlock_irq(&callback_lock);
11958 +       raw_spin_unlock_irq(&callback_lock);
11959  
11960         /* use trialcs->cpus_allowed as a temp variable */
11961         update_cpumasks_hier(cs, trialcs->cpus_allowed);
11962 @@ -1177,9 +1177,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
11963                         continue;
11964                 rcu_read_unlock();
11965  
11966 -               spin_lock_irq(&callback_lock);
11967 +               raw_spin_lock_irq(&callback_lock);
11968                 cp->effective_mems = *new_mems;
11969 -               spin_unlock_irq(&callback_lock);
11970 +               raw_spin_unlock_irq(&callback_lock);
11971  
11972                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11973                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
11974 @@ -1247,9 +1247,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
11975         if (retval < 0)
11976                 goto done;
11977  
11978 -       spin_lock_irq(&callback_lock);
11979 +       raw_spin_lock_irq(&callback_lock);
11980         cs->mems_allowed = trialcs->mems_allowed;
11981 -       spin_unlock_irq(&callback_lock);
11982 +       raw_spin_unlock_irq(&callback_lock);
11983  
11984         /* use trialcs->mems_allowed as a temp variable */
11985         update_nodemasks_hier(cs, &trialcs->mems_allowed);
11986 @@ -1340,9 +1340,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
11987         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
11988                         || (is_spread_page(cs) != is_spread_page(trialcs)));
11989  
11990 -       spin_lock_irq(&callback_lock);
11991 +       raw_spin_lock_irq(&callback_lock);
11992         cs->flags = trialcs->flags;
11993 -       spin_unlock_irq(&callback_lock);
11994 +       raw_spin_unlock_irq(&callback_lock);
11995  
11996         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
11997                 rebuild_sched_domains_locked();
11998 @@ -1757,7 +1757,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11999         cpuset_filetype_t type = seq_cft(sf)->private;
12000         int ret = 0;
12001  
12002 -       spin_lock_irq(&callback_lock);
12003 +       raw_spin_lock_irq(&callback_lock);
12004  
12005         switch (type) {
12006         case FILE_CPULIST:
12007 @@ -1776,7 +1776,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
12008                 ret = -EINVAL;
12009         }
12010  
12011 -       spin_unlock_irq(&callback_lock);
12012 +       raw_spin_unlock_irq(&callback_lock);
12013         return ret;
12014  }
12015  
12016 @@ -1991,12 +1991,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
12017  
12018         cpuset_inc();
12019  
12020 -       spin_lock_irq(&callback_lock);
12021 +       raw_spin_lock_irq(&callback_lock);
12022         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
12023                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
12024                 cs->effective_mems = parent->effective_mems;
12025         }
12026 -       spin_unlock_irq(&callback_lock);
12027 +       raw_spin_unlock_irq(&callback_lock);
12028  
12029         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
12030                 goto out_unlock;
12031 @@ -2023,12 +2023,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
12032         }
12033         rcu_read_unlock();
12034  
12035 -       spin_lock_irq(&callback_lock);
12036 +       raw_spin_lock_irq(&callback_lock);
12037         cs->mems_allowed = parent->mems_allowed;
12038         cs->effective_mems = parent->mems_allowed;
12039         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
12040         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
12041 -       spin_unlock_irq(&callback_lock);
12042 +       raw_spin_unlock_irq(&callback_lock);
12043  out_unlock:
12044         mutex_unlock(&cpuset_mutex);
12045         return 0;
12046 @@ -2067,7 +2067,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
12047  static void cpuset_bind(struct cgroup_subsys_state *root_css)
12048  {
12049         mutex_lock(&cpuset_mutex);
12050 -       spin_lock_irq(&callback_lock);
12051 +       raw_spin_lock_irq(&callback_lock);
12052  
12053         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
12054                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
12055 @@ -2078,7 +2078,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
12056                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
12057         }
12058  
12059 -       spin_unlock_irq(&callback_lock);
12060 +       raw_spin_unlock_irq(&callback_lock);
12061         mutex_unlock(&cpuset_mutex);
12062  }
12063  
12064 @@ -2179,12 +2179,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
12065  {
12066         bool is_empty;
12067  
12068 -       spin_lock_irq(&callback_lock);
12069 +       raw_spin_lock_irq(&callback_lock);
12070         cpumask_copy(cs->cpus_allowed, new_cpus);
12071         cpumask_copy(cs->effective_cpus, new_cpus);
12072         cs->mems_allowed = *new_mems;
12073         cs->effective_mems = *new_mems;
12074 -       spin_unlock_irq(&callback_lock);
12075 +       raw_spin_unlock_irq(&callback_lock);
12076  
12077         /*
12078          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
12079 @@ -2221,10 +2221,10 @@ hotplug_update_tasks(struct cpuset *cs,
12080         if (nodes_empty(*new_mems))
12081                 *new_mems = parent_cs(cs)->effective_mems;
12082  
12083 -       spin_lock_irq(&callback_lock);
12084 +       raw_spin_lock_irq(&callback_lock);
12085         cpumask_copy(cs->effective_cpus, new_cpus);
12086         cs->effective_mems = *new_mems;
12087 -       spin_unlock_irq(&callback_lock);
12088 +       raw_spin_unlock_irq(&callback_lock);
12089  
12090         if (cpus_updated)
12091                 update_tasks_cpumask(cs);
12092 @@ -2317,21 +2317,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
12093  
12094         /* synchronize cpus_allowed to cpu_active_mask */
12095         if (cpus_updated) {
12096 -               spin_lock_irq(&callback_lock);
12097 +               raw_spin_lock_irq(&callback_lock);
12098                 if (!on_dfl)
12099                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
12100                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
12101 -               spin_unlock_irq(&callback_lock);
12102 +               raw_spin_unlock_irq(&callback_lock);
12103                 /* we don't mess with cpumasks of tasks in top_cpuset */
12104         }
12105  
12106         /* synchronize mems_allowed to N_MEMORY */
12107         if (mems_updated) {
12108 -               spin_lock_irq(&callback_lock);
12109 +               raw_spin_lock_irq(&callback_lock);
12110                 if (!on_dfl)
12111                         top_cpuset.mems_allowed = new_mems;
12112                 top_cpuset.effective_mems = new_mems;
12113 -               spin_unlock_irq(&callback_lock);
12114 +               raw_spin_unlock_irq(&callback_lock);
12115                 update_tasks_nodemask(&top_cpuset);
12116         }
12117  
12118 @@ -2436,11 +2436,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
12119  {
12120         unsigned long flags;
12121  
12122 -       spin_lock_irqsave(&callback_lock, flags);
12123 +       raw_spin_lock_irqsave(&callback_lock, flags);
12124         rcu_read_lock();
12125         guarantee_online_cpus(task_cs(tsk), pmask);
12126         rcu_read_unlock();
12127 -       spin_unlock_irqrestore(&callback_lock, flags);
12128 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
12129  }
12130  
12131  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
12132 @@ -2488,11 +2488,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
12133         nodemask_t mask;
12134         unsigned long flags;
12135  
12136 -       spin_lock_irqsave(&callback_lock, flags);
12137 +       raw_spin_lock_irqsave(&callback_lock, flags);
12138         rcu_read_lock();
12139         guarantee_online_mems(task_cs(tsk), &mask);
12140         rcu_read_unlock();
12141 -       spin_unlock_irqrestore(&callback_lock, flags);
12142 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
12143  
12144         return mask;
12145  }
12146 @@ -2584,14 +2584,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
12147                 return true;
12148  
12149         /* Not hardwall and node outside mems_allowed: scan up cpusets */
12150 -       spin_lock_irqsave(&callback_lock, flags);
12151 +       raw_spin_lock_irqsave(&callback_lock, flags);
12152  
12153         rcu_read_lock();
12154         cs = nearest_hardwall_ancestor(task_cs(current));
12155         allowed = node_isset(node, cs->mems_allowed);
12156         rcu_read_unlock();
12157  
12158 -       spin_unlock_irqrestore(&callback_lock, flags);
12159 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
12160         return allowed;
12161  }
12162  
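
callback_lock nests inside paths that already run with interrupts hard-disabled, so on RT it has to stay a true spinning lock rather than the sleeping spinlock_t substitute. A hedged sketch of the idiom with generic names (not the cpuset code itself):

    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(example_lock);

    static void example_update(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&example_lock, flags);
            /* short, non-sleeping critical section */
            raw_spin_unlock_irqrestore(&example_lock, flags);
    }
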
12163 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
12164 index fc1ef736253c..83c666537a7a 100644
12165 --- a/kernel/debug/kdb/kdb_io.c
12166 +++ b/kernel/debug/kdb/kdb_io.c
12167 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12168         int linecount;
12169         int colcount;
12170         int logging, saved_loglevel = 0;
12171 -       int saved_trap_printk;
12172         int got_printf_lock = 0;
12173         int retlen = 0;
12174         int fnd, len;
12175 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12176         unsigned long uninitialized_var(flags);
12177  
12178         preempt_disable();
12179 -       saved_trap_printk = kdb_trap_printk;
12180 -       kdb_trap_printk = 0;
12181  
12182         /* Serialize kdb_printf if multiple cpus try to write at once.
12183          * But if any cpu goes recursive in kdb, just print the output,
12184 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12185         } else {
12186                 __release(kdb_printf_lock);
12187         }
12188 -       kdb_trap_printk = saved_trap_printk;
12189         preempt_enable();
12190         return retlen;
12191  }
12192 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
12193         va_list ap;
12194         int r;
12195  
12196 +       kdb_trap_printk++;
12197         va_start(ap, fmt);
12198         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
12199         va_end(ap);
12200 +       kdb_trap_printk--;
12201  
12202         return r;
12203  }
12204 diff --git a/kernel/events/core.c b/kernel/events/core.c
12205 index 13b9784427b0..f74fbfe5465c 100644
12206 --- a/kernel/events/core.c
12207 +++ b/kernel/events/core.c
12208 @@ -1050,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
12209         raw_spin_lock_init(&cpuctx->hrtimer_lock);
12210         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
12211         timer->function = perf_mux_hrtimer_handler;
12212 +       timer->irqsafe = 1;
12213  }
12214  
12215  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
12216 @@ -8405,6 +8406,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
12217  
12218         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
12219         hwc->hrtimer.function = perf_swevent_hrtimer;
12220 +       hwc->hrtimer.irqsafe = 1;
12221  
12222         /*
12223          * Since hrtimers have a fixed rate, we can do a static freq->period
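
Marking these hrtimers irqsafe lets them keep expiring from hard interrupt context on RT instead of being deferred to the softirq-based expiry path. A hedged sketch of the flag's use when arming a timer; the handler is illustrative and the irqsafe field itself exists only with this patch series applied:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static enum hrtimer_restart example_hrtimer_fn(struct hrtimer *t)
    {
            /* must not sleep: runs in hard irq context when irqsafe */
            return HRTIMER_NORESTART;
    }

    static void example_arm_timer(struct hrtimer *t)
    {
            hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            t->function = example_hrtimer_fn;
            t->irqsafe = 1;         /* RT: expire directly from hard irq */
            hrtimer_start(t, ms_to_ktime(10), HRTIMER_MODE_REL);
    }
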
12224 diff --git a/kernel/exit.c b/kernel/exit.c
12225 index 3076f3089919..fb2ebcf3ca7c 100644
12226 --- a/kernel/exit.c
12227 +++ b/kernel/exit.c
12228 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
12229          * Do this under ->siglock, we can race with another thread
12230          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
12231          */
12232 -       flush_sigqueue(&tsk->pending);
12233 +       flush_task_sigqueue(tsk);
12234         tsk->sighand = NULL;
12235         spin_unlock(&sighand->siglock);
12236  
12237 diff --git a/kernel/fork.c b/kernel/fork.c
12238 index 9321b1ad3335..276acd8acf0a 100644
12239 --- a/kernel/fork.c
12240 +++ b/kernel/fork.c
12241 @@ -76,6 +76,7 @@
12242  #include <linux/compiler.h>
12243  #include <linux/sysctl.h>
12244  #include <linux/kcov.h>
12245 +#include <linux/kprobes.h>
12246  
12247  #include <asm/pgtable.h>
12248  #include <asm/pgalloc.h>
12249 @@ -376,13 +377,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
12250         if (atomic_dec_and_test(&sig->sigcnt))
12251                 free_signal_struct(sig);
12252  }
12253 -
12254 +#ifdef CONFIG_PREEMPT_RT_BASE
12255 +static
12256 +#endif
12257  void __put_task_struct(struct task_struct *tsk)
12258  {
12259         WARN_ON(!tsk->exit_state);
12260         WARN_ON(atomic_read(&tsk->usage));
12261         WARN_ON(tsk == current);
12262  
12263 +       /*
12264 +        * Remove function-return probe instances associated with this
12265 +        * task and put them back on the free list.
12266 +        */
12267 +       kprobe_flush_task(tsk);
12268 +
12269 +       /* Task is done with its stack. */
12270 +       put_task_stack(tsk);
12271 +
12272         cgroup_free(tsk);
12273         task_numa_free(tsk);
12274         security_task_free(tsk);
12275 @@ -393,7 +405,18 @@ void __put_task_struct(struct task_struct *tsk)
12276         if (!profile_handoff_task(tsk))
12277                 free_task(tsk);
12278  }
12279 +#ifndef CONFIG_PREEMPT_RT_BASE
12280  EXPORT_SYMBOL_GPL(__put_task_struct);
12281 +#else
12282 +void __put_task_struct_cb(struct rcu_head *rhp)
12283 +{
12284 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
12285 +
12286 +       __put_task_struct(tsk);
12287 +
12288 +}
12289 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
12290 +#endif
12291  
12292  void __init __weak arch_task_cache_init(void) { }
12293  
12294 @@ -535,6 +558,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
12295         tsk->splice_pipe = NULL;
12296         tsk->task_frag.page = NULL;
12297         tsk->wake_q.next = NULL;
12298 +       tsk->wake_q_sleeper.next = NULL;
12299  
12300         account_kernel_stack(tsk, 1);
12301  
12302 @@ -861,6 +885,19 @@ void __mmdrop(struct mm_struct *mm)
12303  }
12304  EXPORT_SYMBOL_GPL(__mmdrop);
12305  
12306 +#ifdef CONFIG_PREEMPT_RT_BASE
12307 +/*
12308 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
12309 + * want another facility to make this work.
12310 + */
12311 +void __mmdrop_delayed(struct rcu_head *rhp)
12312 +{
12313 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
12314 +
12315 +       __mmdrop(mm);
12316 +}
12317 +#endif
12318 +
12319  static inline void __mmput(struct mm_struct *mm)
12320  {
12321         VM_BUG_ON(atomic_read(&mm->mm_users));
12322 @@ -1426,6 +1463,7 @@ static void rt_mutex_init_task(struct task_struct *p)
12323  #ifdef CONFIG_RT_MUTEXES
12324         p->pi_waiters = RB_ROOT;
12325         p->pi_waiters_leftmost = NULL;
12326 +       p->pi_top_task = NULL;
12327         p->pi_blocked_on = NULL;
12328  #endif
12329  }
12330 @@ -1435,6 +1473,9 @@ static void rt_mutex_init_task(struct task_struct *p)
12331   */
12332  static void posix_cpu_timers_init(struct task_struct *tsk)
12333  {
12334 +#ifdef CONFIG_PREEMPT_RT_BASE
12335 +       tsk->posix_timer_list = NULL;
12336 +#endif
12337         tsk->cputime_expires.prof_exp = 0;
12338         tsk->cputime_expires.virt_exp = 0;
12339         tsk->cputime_expires.sched_exp = 0;
12340 @@ -1561,6 +1602,7 @@ static __latent_entropy struct task_struct *copy_process(
12341         spin_lock_init(&p->alloc_lock);
12342  
12343         init_sigpending(&p->pending);
12344 +       p->sigqueue_cache = NULL;
12345  
12346         p->utime = p->stime = p->gtime = 0;
12347         p->utimescaled = p->stimescaled = 0;
12348 diff --git a/kernel/futex.c b/kernel/futex.c
12349 index 88bad86180ac..2e074d63e8fa 100644
12350 --- a/kernel/futex.c
12351 +++ b/kernel/futex.c
12352 @@ -801,7 +801,7 @@ static int refill_pi_state_cache(void)
12353         return 0;
12354  }
12355  
12356 -static struct futex_pi_state * alloc_pi_state(void)
12357 +static struct futex_pi_state *alloc_pi_state(void)
12358  {
12359         struct futex_pi_state *pi_state = current->pi_state_cache;
12360  
12361 @@ -811,6 +811,11 @@ static struct futex_pi_state * alloc_pi_state(void)
12362         return pi_state;
12363  }
12364  
12365 +static void get_pi_state(struct futex_pi_state *pi_state)
12366 +{
12367 +       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
12368 +}
12369 +
12370  /*
12371   * Drops a reference to the pi_state object and frees or caches it
12372   * when the last reference is gone.
12373 @@ -855,7 +860,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
12374   * Look up the task based on what TID userspace gave us.
12375   * We dont trust it.
12376   */
12377 -static struct task_struct * futex_find_get_task(pid_t pid)
12378 +static struct task_struct *futex_find_get_task(pid_t pid)
12379  {
12380         struct task_struct *p;
12381  
12382 @@ -905,7 +910,9 @@ void exit_pi_state_list(struct task_struct *curr)
12383                  * task still owns the PI-state:
12384                  */
12385                 if (head->next != next) {
12386 +                       raw_spin_unlock_irq(&curr->pi_lock);
12387                         spin_unlock(&hb->lock);
12388 +                       raw_spin_lock_irq(&curr->pi_lock);
12389                         continue;
12390                 }
12391  
12392 @@ -915,10 +922,12 @@ void exit_pi_state_list(struct task_struct *curr)
12393                 pi_state->owner = NULL;
12394                 raw_spin_unlock_irq(&curr->pi_lock);
12395  
12396 -               rt_mutex_unlock(&pi_state->pi_mutex);
12397 -
12398 +               get_pi_state(pi_state);
12399                 spin_unlock(&hb->lock);
12400  
12401 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12402 +               put_pi_state(pi_state);
12403 +
12404                 raw_spin_lock_irq(&curr->pi_lock);
12405         }
12406         raw_spin_unlock_irq(&curr->pi_lock);
12407 @@ -972,6 +981,39 @@ void exit_pi_state_list(struct task_struct *curr)
12408   *
12409   * [10] There is no transient state which leaves owner and user space
12410   *     TID out of sync.
12411 + *
12412 + *
12413 + * Serialization and lifetime rules:
12414 + *
12415 + * hb->lock:
12416 + *
12417 + *     hb -> futex_q, relation
12418 + *     futex_q -> pi_state, relation
12419 + *
12420 + *     (cannot be raw because hb can contain an arbitrary number
12421 + *      of futex_q's)
12422 + *
12423 + * pi_mutex->wait_lock:
12424 + *
12425 + *     {uval, pi_state}
12426 + *
12427 + *     (and pi_mutex 'obviously')
12428 + *
12429 + * p->pi_lock:
12430 + *
12431 + *     p->pi_state_list -> pi_state->list, relation
12432 + *
12433 + * pi_state->refcount:
12434 + *
12435 + *     pi_state lifetime
12436 + *
12437 + *
12438 + * Lock order:
12439 + *
12440 + *   hb->lock
12441 + *     pi_mutex->wait_lock
12442 + *       p->pi_lock
12443 + *
12444   */
12445  
12446  /*
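
The ordering spelled out above means wait_lock may be taken while hb->lock is held, and p->pi_lock only inside wait_lock, never the other way round. A hedged sketch of the nesting; the function is illustrative and not a quotation of a real call site:

    /* Illustrative nesting only - see the lock order documented above. */
    static void example_lock_nesting(struct futex_hash_bucket *hb,
                                     struct futex_pi_state *pi_state,
                                     struct task_struct *p)
    {
            spin_lock(&hb->lock);
            raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
            raw_spin_lock(&p->pi_lock);
            /* ... state that needs all three locks ... */
            raw_spin_unlock(&p->pi_lock);
            raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
            spin_unlock(&hb->lock);
    }
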
12447 @@ -979,10 +1021,13 @@ void exit_pi_state_list(struct task_struct *curr)
12448   * the pi_state against the user space value. If correct, attach to
12449   * it.
12450   */
12451 -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12452 +static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
12453 +                             struct futex_pi_state *pi_state,
12454                               struct futex_pi_state **ps)
12455  {
12456         pid_t pid = uval & FUTEX_TID_MASK;
12457 +       u32 uval2;
12458 +       int ret;
12459  
12460         /*
12461          * Userspace might have messed up non-PI and PI futexes [3]
12462 @@ -990,9 +1035,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12463         if (unlikely(!pi_state))
12464                 return -EINVAL;
12465  
12466 +       /*
12467 +        * We get here with hb->lock held, and having found a
12468 +        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
12469 +        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
12470 +        * which in turn means that futex_lock_pi() still has a reference on
12471 +        * our pi_state.
12472 +        *
12473 +        * The waiter holding a reference on @pi_state also protects against
12474 +        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
12475 +        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
12476 +        * free pi_state before we can take a reference ourselves.
12477 +        */
12478         WARN_ON(!atomic_read(&pi_state->refcount));
12479  
12480         /*
12481 +        * Now that we have a pi_state, we can acquire wait_lock
12482 +        * and do the state validation.
12483 +        */
12484 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12485 +
12486 +       /*
12487 +        * Since {uval, pi_state} is serialized by wait_lock, and our current
12488 +        * uval was read without holding it, it can have changed. Verify it
12489 +        * still is what we expect it to be, otherwise retry the entire
12490 +        * operation.
12491 +        */
12492 +       if (get_futex_value_locked(&uval2, uaddr))
12493 +               goto out_efault;
12494 +
12495 +       if (uval != uval2)
12496 +               goto out_eagain;
12497 +
12498 +       /*
12499          * Handle the owner died case:
12500          */
12501         if (uval & FUTEX_OWNER_DIED) {
12502 @@ -1007,11 +1082,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12503                          * is not 0. Inconsistent state. [5]
12504                          */
12505                         if (pid)
12506 -                               return -EINVAL;
12507 +                               goto out_einval;
12508                         /*
12509                          * Take a ref on the state and return success. [4]
12510                          */
12511 -                       goto out_state;
12512 +                       goto out_attach;
12513                 }
12514  
12515                 /*
12516 @@ -1023,14 +1098,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12517                  * Take a ref on the state and return success. [6]
12518                  */
12519                 if (!pid)
12520 -                       goto out_state;
12521 +                       goto out_attach;
12522         } else {
12523                 /*
12524                  * If the owner died bit is not set, then the pi_state
12525                  * must have an owner. [7]
12526                  */
12527                 if (!pi_state->owner)
12528 -                       return -EINVAL;
12529 +                       goto out_einval;
12530         }
12531  
12532         /*
12533 @@ -1039,11 +1114,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12534          * user space TID. [9/10]
12535          */
12536         if (pid != task_pid_vnr(pi_state->owner))
12537 -               return -EINVAL;
12538 -out_state:
12539 -       atomic_inc(&pi_state->refcount);
12540 +               goto out_einval;
12541 +
12542 +out_attach:
12543 +       get_pi_state(pi_state);
12544 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12545         *ps = pi_state;
12546         return 0;
12547 +
12548 +out_einval:
12549 +       ret = -EINVAL;
12550 +       goto out_error;
12551 +
12552 +out_eagain:
12553 +       ret = -EAGAIN;
12554 +       goto out_error;
12555 +
12556 +out_efault:
12557 +       ret = -EFAULT;
12558 +       goto out_error;
12559 +
12560 +out_error:
12561 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12562 +       return ret;
12563  }
12564  
12565  /*
12566 @@ -1094,6 +1187,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
12567  
12568         /*
12569          * No existing pi state. First waiter. [2]
12570 +        *
12571 +        * This creates pi_state, we have hb->lock held, this means nothing can
12572 +        * observe this state, wait_lock is irrelevant.
12573          */
12574         pi_state = alloc_pi_state();
12575  
12576 @@ -1118,17 +1214,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
12577         return 0;
12578  }
12579  
12580 -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
12581 +static int lookup_pi_state(u32 __user *uaddr, u32 uval,
12582 +                          struct futex_hash_bucket *hb,
12583                            union futex_key *key, struct futex_pi_state **ps)
12584  {
12585 -       struct futex_q *match = futex_top_waiter(hb, key);
12586 +       struct futex_q *top_waiter = futex_top_waiter(hb, key);
12587  
12588         /*
12589          * If there is a waiter on that futex, validate it and
12590          * attach to the pi_state when the validation succeeds.
12591          */
12592 -       if (match)
12593 -               return attach_to_pi_state(uval, match->pi_state, ps);
12594 +       if (top_waiter)
12595 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12596  
12597         /*
12598          * We are the first waiter - try to look up the owner based on
12599 @@ -1147,7 +1244,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
12600         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
12601                 return -EFAULT;
12602  
12603 -       /*If user space value changed, let the caller retry */
12604 +       /* If user space value changed, let the caller retry */
12605         return curval != uval ? -EAGAIN : 0;
12606  }
12607  
12608 @@ -1175,7 +1272,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12609                                 struct task_struct *task, int set_waiters)
12610  {
12611         u32 uval, newval, vpid = task_pid_vnr(task);
12612 -       struct futex_q *match;
12613 +       struct futex_q *top_waiter;
12614         int ret;
12615  
12616         /*
12617 @@ -1201,9 +1298,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12618          * Lookup existing state first. If it exists, try to attach to
12619          * its pi_state.
12620          */
12621 -       match = futex_top_waiter(hb, key);
12622 -       if (match)
12623 -               return attach_to_pi_state(uval, match->pi_state, ps);
12624 +       top_waiter = futex_top_waiter(hb, key);
12625 +       if (top_waiter)
12626 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12627  
12628         /*
12629          * No waiter and user TID is 0. We are here because the
12630 @@ -1284,50 +1381,45 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
12631         wake_q_add(wake_q, p);
12632         __unqueue_futex(q);
12633         /*
12634 -        * The waiting task can free the futex_q as soon as
12635 -        * q->lock_ptr = NULL is written, without taking any locks. A
12636 -        * memory barrier is required here to prevent the following
12637 -        * store to lock_ptr from getting ahead of the plist_del.
12638 +        * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
12639 +        * is written, without taking any locks. This is possible in the event
12640 +        * of a spurious wakeup, for example. A memory barrier is required here
12641 +        * to prevent the following store to lock_ptr from getting ahead of the
12642 +        * plist_del in __unqueue_futex().
12643          */
12644 -       smp_wmb();
12645 -       q->lock_ptr = NULL;
12646 +       smp_store_release(&q->lock_ptr, NULL);
12647  }
12648  
12649 -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12650 -                        struct futex_hash_bucket *hb)
12651 +/*
12652 + * Caller must hold a reference on @pi_state.
12653 + */
12654 +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
12655  {
12656 -       struct task_struct *new_owner;
12657 -       struct futex_pi_state *pi_state = this->pi_state;
12658         u32 uninitialized_var(curval), newval;
12659 +       struct task_struct *new_owner;
12660 +       bool postunlock = false;
12661         WAKE_Q(wake_q);
12662 -       bool deboost;
12663 +       WAKE_Q(wake_sleeper_q);
12664         int ret = 0;
12665  
12666 -       if (!pi_state)
12667 -               return -EINVAL;
12668 -
12669 -       /*
12670 -        * If current does not own the pi_state then the futex is
12671 -        * inconsistent and user space fiddled with the futex value.
12672 -        */
12673 -       if (pi_state->owner != current)
12674 -               return -EINVAL;
12675 -
12676 -       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12677         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
12678 +       if (WARN_ON_ONCE(!new_owner)) {
12679 +               /*
12680 +                * As per the comment in futex_unlock_pi() this should not happen.
12681 +                *
12682 +                * When this happens, give up our locks and try again, giving
12683 +                * the futex_lock_pi() instance time to complete, either by
12684 +                * waiting on the rtmutex or removing itself from the futex
12685 +                * queue.
12686 +                */
12687 +               ret = -EAGAIN;
12688 +               goto out_unlock;
12689 +       }
12690  
12691         /*
12692 -        * It is possible that the next waiter (the one that brought
12693 -        * this owner to the kernel) timed out and is no longer
12694 -        * waiting on the lock.
12695 -        */
12696 -       if (!new_owner)
12697 -               new_owner = this->task;
12698 -
12699 -       /*
12700 -        * We pass it to the next owner. The WAITERS bit is always
12701 -        * kept enabled while there is PI state around. We cleanup the
12702 -        * owner died bit, because we are the owner.
12703 +        * We pass it to the next owner. The WAITERS bit is always kept
12704 +        * enabled while there is PI state around. We cleanup the owner
12705 +        * died bit, because we are the owner.
12706          */
12707         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
12708  
12709 @@ -1336,6 +1428,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12710  
12711         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
12712                 ret = -EFAULT;
12713 +
12714         } else if (curval != uval) {
12715                 /*
12716                  * If a unconditional UNLOCK_PI operation (user space did not
12717                  * If an unconditional UNLOCK_PI operation (user space did not
12718                 else
12719                         ret = -EINVAL;
12720         }
12721 -       if (ret) {
12722 -               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12723 -               return ret;
12724 -       }
12725 +
12726 +       if (ret)
12727 +               goto out_unlock;
12728 +
12729 +       /*
12730 +        * This is a point of no return; once we modify the uval there is no
12731 +        * going back and subsequent operations must not fail.
12732 +        */
12733  
12734         raw_spin_lock(&pi_state->owner->pi_lock);
12735         WARN_ON(list_empty(&pi_state->list));
12736 @@ -1364,22 +1461,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12737         pi_state->owner = new_owner;
12738         raw_spin_unlock(&new_owner->pi_lock);
12739  
12740 +       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
12741 +                                            &wake_sleeper_q);
12742 +out_unlock:
12743         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12744  
12745 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
12746 +       if (postunlock)
12747 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
12748  
12749 -       /*
12750 -        * First unlock HB so the waiter does not spin on it once he got woken
12751 -        * up. Second wake up the waiter before the priority is adjusted. If we
12752 -        * deboost first (and lose our higher priority), then the task might get
12753 -        * scheduled away before the wake up can take place.
12754 -        */
12755 -       spin_unlock(&hb->lock);
12756 -       wake_up_q(&wake_q);
12757 -       if (deboost)
12758 -               rt_mutex_adjust_prio(current);
12759 -
12760 -       return 0;
12761 +       return ret;
12762  }
12763  
12764  /*
12765 @@ -1825,7 +1915,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12766                          * If that call succeeds then we have pi_state and an
12767                          * initial refcount on it.
12768                          */
12769 -                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
12770 +                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
12771                 }
12772  
12773                 switch (ret) {
12774 @@ -1908,7 +1998,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12775                          * refcount on the pi_state and store the pointer in
12776                          * the futex_q object of the waiter.
12777                          */
12778 -                       atomic_inc(&pi_state->refcount);
12779 +                       get_pi_state(pi_state);
12780                         this->pi_state = pi_state;
12781                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
12782                                                         this->rt_waiter,
12783 @@ -1925,6 +2015,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12784                                 requeue_pi_wake_futex(this, &key2, hb2);
12785                                 drop_count++;
12786                                 continue;
12787 +                       } else if (ret == -EAGAIN) {
12788 +                               /*
12789 +                                * Waiter was woken by timeout or
12790 +                                * signal and has set pi_blocked_on to
12791 +                                * PI_WAKEUP_INPROGRESS before we
12792 +                                * tried to enqueue it on the rtmutex.
12793 +                                */
12794 +                               this->pi_state = NULL;
12795 +                               put_pi_state(pi_state);
12796 +                               continue;
12797                         } else if (ret) {
12798                                 /*
12799                                  * rt_mutex_start_proxy_lock() detected a
12800 @@ -2008,20 +2108,7 @@ queue_unlock(struct futex_hash_bucket *hb)
12801         hb_waiters_dec(hb);
12802  }
12803  
12804 -/**
12805 - * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12806 - * @q: The futex_q to enqueue
12807 - * @hb:        The destination hash bucket
12808 - *
12809 - * The hb->lock must be held by the caller, and is released here. A call to
12810 - * queue_me() is typically paired with exactly one call to unqueue_me().  The
12811 - * exceptions involve the PI related operations, which may use unqueue_me_pi()
12812 - * or nothing if the unqueue is done as part of the wake process and the unqueue
12813 - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12814 - * an example).
12815 - */
12816 -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12817 -       __releases(&hb->lock)
12818 +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12819  {
12820         int prio;
12821  
12822 @@ -2038,6 +2125,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12823         plist_node_init(&q->list, prio);
12824         plist_add(&q->list, &hb->chain);
12825         q->task = current;
12826 +}
12827 +
12828 +/**
12829 + * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12830 + * @q: The futex_q to enqueue
12831 + * @hb:        The destination hash bucket
12832 + *
12833 + * The hb->lock must be held by the caller, and is released here. A call to
12834 + * queue_me() is typically paired with exactly one call to unqueue_me().  The
12835 + * exceptions involve the PI related operations, which may use unqueue_me_pi()
12836 + * or nothing if the unqueue is done as part of the wake process and the unqueue
12837 + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12838 + * an example).
12839 + */
12840 +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12841 +       __releases(&hb->lock)
12842 +{
12843 +       __queue_me(q, hb);
12844         spin_unlock(&hb->lock);
12845  }
12846  
12847 @@ -2124,10 +2229,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12848  {
12849         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
12850         struct futex_pi_state *pi_state = q->pi_state;
12851 -       struct task_struct *oldowner = pi_state->owner;
12852         u32 uval, uninitialized_var(curval), newval;
12853 +       struct task_struct *oldowner;
12854         int ret;
12855  
12856 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12857 +
12858 +       oldowner = pi_state->owner;
12859         /* Owner died? */
12860         if (!pi_state->owner)
12861                 newtid |= FUTEX_OWNER_DIED;
12862 @@ -2135,7 +2243,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12863         /*
12864          * We are here either because we stole the rtmutex from the
12865          * previous highest priority waiter or we are the highest priority
12866 -        * waiter but failed to get the rtmutex the first time.
12867 +        * waiter but have failed to get the rtmutex the first time.
12868 +        *
12869          * We have to replace the newowner TID in the user space variable.
12870          * This must be atomic as we have to preserve the owner died bit here.
12871          *
12872 @@ -2143,17 +2252,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12873          * because we can fault here. Imagine swapped out pages or a fork
12874          * that marked all the anonymous memory readonly for cow.
12875          *
12876 -        * Modifying pi_state _before_ the user space value would
12877 -        * leave the pi_state in an inconsistent state when we fault
12878 -        * here, because we need to drop the hash bucket lock to
12879 -        * handle the fault. This might be observed in the PID check
12880 -        * in lookup_pi_state.
12881 +        * Modifying pi_state _before_ the user space value would leave the
12882 +        * pi_state in an inconsistent state when we fault here, because we
12883 +        * need to drop the locks to handle the fault. This might be observed
12884 +        * in the PID check in lookup_pi_state.
12885          */
12886  retry:
12887         if (get_futex_value_locked(&uval, uaddr))
12888                 goto handle_fault;
12889  
12890 -       while (1) {
12891 +       for (;;) {
12892                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
12893  
12894                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
12895 @@ -2168,47 +2276,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12896          * itself.
12897          */
12898         if (pi_state->owner != NULL) {
12899 -               raw_spin_lock_irq(&pi_state->owner->pi_lock);
12900 +               raw_spin_lock(&pi_state->owner->pi_lock);
12901                 WARN_ON(list_empty(&pi_state->list));
12902                 list_del_init(&pi_state->list);
12903 -               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
12904 +               raw_spin_unlock(&pi_state->owner->pi_lock);
12905         }
12906  
12907         pi_state->owner = newowner;
12908  
12909 -       raw_spin_lock_irq(&newowner->pi_lock);
12910 +       raw_spin_lock(&newowner->pi_lock);
12911         WARN_ON(!list_empty(&pi_state->list));
12912         list_add(&pi_state->list, &newowner->pi_state_list);
12913 -       raw_spin_unlock_irq(&newowner->pi_lock);
12914 +       raw_spin_unlock(&newowner->pi_lock);
12915 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12916 +
12917         return 0;
12918  
12919         /*
12920 -        * To handle the page fault we need to drop the hash bucket
12921 -        * lock here. That gives the other task (either the highest priority
12922 -        * waiter itself or the task which stole the rtmutex) the
12923 -        * chance to try the fixup of the pi_state. So once we are
12924 -        * back from handling the fault we need to check the pi_state
12925 -        * after reacquiring the hash bucket lock and before trying to
12926 -        * do another fixup. When the fixup has been done already we
12927 -        * simply return.
12928 +        * To handle the page fault we need to drop the locks here. That gives
12929 +        * the other task (either the highest priority waiter itself or the
12930 +        * task which stole the rtmutex) the chance to try the fixup of the
12931 +        * pi_state. So once we are back from handling the fault we need to
12932 +        * check the pi_state after reacquiring the locks and before trying to
12933 +        * do another fixup. When the fixup has been done already we simply
12934 +        * return.
12935 +        *
12936 +        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
12937 +        * drop hb->lock since the caller owns the hb -> futex_q relation.
12938 +        * Dropping the pi_mutex->wait_lock requires the state to be revalidated.
12939          */
12940  handle_fault:
12941 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12942         spin_unlock(q->lock_ptr);
12943  
12944         ret = fault_in_user_writeable(uaddr);
12945  
12946         spin_lock(q->lock_ptr);
12947 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12948  
12949         /*
12950          * Check if someone else fixed it for us:
12951          */
12952 -       if (pi_state->owner != oldowner)
12953 -               return 0;
12954 +       if (pi_state->owner != oldowner) {
12955 +               ret = 0;
12956 +               goto out_unlock;
12957 +       }
12958  
12959         if (ret)
12960 -               return ret;
12961 +               goto out_unlock;
12962  
12963         goto retry;
12964 +
12965 +out_unlock:
12966 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12967 +       return ret;
12968  }
12969  
12970  static long futex_wait_restart(struct restart_block *restart);
12971 @@ -2230,13 +2351,16 @@ static long futex_wait_restart(struct restart_block *restart);
12972   */
12973  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12974  {
12975 -       struct task_struct *owner;
12976         int ret = 0;
12977  
12978         if (locked) {
12979                 /*
12980                  * Got the lock. We might not be the anticipated owner if we
12981                  * did a lock-steal - fix up the PI-state in that case:
12982 +                *
12983 +                * We can safely read pi_state->owner without holding wait_lock
12984 +                * because we now own the rt_mutex, only the owner will attempt
12985 +                * to change it.
12986                  */
12987                 if (q->pi_state->owner != current)
12988                         ret = fixup_pi_state_owner(uaddr, q, current);
12989 @@ -2244,43 +2368,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12990         }
12991  
12992         /*
12993 -        * Catch the rare case, where the lock was released when we were on the
12994 -        * way back before we locked the hash bucket.
12995 -        */
12996 -       if (q->pi_state->owner == current) {
12997 -               /*
12998 -                * Try to get the rt_mutex now. This might fail as some other
12999 -                * task acquired the rt_mutex after we removed ourself from the
13000 -                * rt_mutex waiters list.
13001 -                */
13002 -               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
13003 -                       locked = 1;
13004 -                       goto out;
13005 -               }
13006 -
13007 -               /*
13008 -                * pi_state is incorrect, some other task did a lock steal and
13009 -                * we returned due to timeout or signal without taking the
13010 -                * rt_mutex. Too late.
13011 -                */
13012 -               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
13013 -               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
13014 -               if (!owner)
13015 -                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
13016 -               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
13017 -               ret = fixup_pi_state_owner(uaddr, q, owner);
13018 -               goto out;
13019 -       }
13020 -
13021 -       /*
13022          * Paranoia check. If we did not take the lock, then we should not be
13023          * the owner of the rt_mutex.
13024          */
13025 -       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
13026 +       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
13027                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
13028                                 "pi-state %p\n", ret,
13029                                 q->pi_state->pi_mutex.owner,
13030                                 q->pi_state->owner);
13031 +       }
13032  
13033  out:
13034         return ret ? ret : locked;
13035 @@ -2504,6 +2600,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13036                          ktime_t *time, int trylock)
13037  {
13038         struct hrtimer_sleeper timeout, *to = NULL;
13039 +       struct futex_pi_state *pi_state = NULL;
13040 +       struct rt_mutex_waiter rt_waiter;
13041         struct futex_hash_bucket *hb;
13042         struct futex_q q = futex_q_init;
13043         int res, ret;
13044 @@ -2556,25 +2654,77 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13045                 }
13046         }
13047  
13048 +       WARN_ON(!q.pi_state);
13049 +
13050         /*
13051          * Only actually queue now that the atomic ops are done:
13052          */
13053 -       queue_me(&q, hb);
13054 +       __queue_me(&q, hb);
13055  
13056 -       WARN_ON(!q.pi_state);
13057 -       /*
13058 -        * Block on the PI mutex:
13059 -        */
13060 -       if (!trylock) {
13061 -               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
13062 -       } else {
13063 -               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
13064 +       if (trylock) {
13065 +               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
13066                 /* Fixup the trylock return value: */
13067                 ret = ret ? 0 : -EWOULDBLOCK;
13068 +               goto no_block;
13069 +       }
13070 +
13071 +       rt_mutex_init_waiter(&rt_waiter, false);
13072 +
13073 +       /*
13074 +        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
13075 +        * hold it while doing rt_mutex_start_proxy(), because then it will
13076 +        * include hb->lock in the blocking chain, even though we'll not in
13077 +        * fact hold it while blocking. This will lead it to report -EDEADLK
13078 +        * and BUG when futex_unlock_pi() interleaves with this.
13079 +        *
13080 +        * Therefore acquire wait_lock while holding hb->lock, but drop the
13081 +        * latter before calling rt_mutex_start_proxy_lock(). This still fully
13082 +        * serializes against futex_unlock_pi() as that does the exact same
13083 +        * lock handoff sequence.
13084 +        */
13085 +       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
13086 +       /*
13087 +        * the migrate_disable() here disables migration in the in_atomic() fast
13088 +        * path which is enabled again in the following spin_unlock(). We have
13089 +        * one migrate_disable() pending in the slow-path which is reversed
13090 +        * after the raw_spin_unlock_irq() where we leave the atomic context.
13091 +        */
13092 +       migrate_disable();
13093 +
13094 +       spin_unlock(q.lock_ptr);
13095 +       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
13096 +       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
13097 +       migrate_enable();
13098 +
13099 +       if (ret) {
13100 +               if (ret == 1)
13101 +                       ret = 0;
13102 +
13103 +               spin_lock(q.lock_ptr);
13104 +               goto no_block;
13105         }
13106  
13107 +
13108 +       if (unlikely(to))
13109 +               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
13110 +
13111 +       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
13112 +
13113         spin_lock(q.lock_ptr);
13114         /*
13115 +        * If we failed to acquire the lock (signal/timeout), we must
13116 +        * first acquire the hb->lock before removing the lock from the
13117 +        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
13118 +        * wait lists consistent.
13119 +        *
13120 +        * In particular; it is important that futex_unlock_pi() can not
13121 +        * In particular, it is important that futex_unlock_pi() cannot
13122 +        */
13123 +       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
13124 +               ret = 0;
13125 +
13126 +no_block:
13127 +       /*
13128          * Fixup the pi_state owner and possibly acquire the lock if we
13129          * haven't already.
13130          */
13131 @@ -2590,12 +2740,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13132          * If fixup_owner() faulted and was unable to handle the fault, unlock
13133          * it and return the fault to userspace.
13134          */
13135 -       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
13136 -               rt_mutex_unlock(&q.pi_state->pi_mutex);
13137 +       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
13138 +               pi_state = q.pi_state;
13139 +               get_pi_state(pi_state);
13140 +       }
13141  
13142         /* Unqueue and drop the lock */
13143         unqueue_me_pi(&q);
13144  
13145 +       if (pi_state) {
13146 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
13147 +               put_pi_state(pi_state);
13148 +       }
13149 +
13150         goto out_put_key;
13151  
13152  out_unlock_put_key:
13153 @@ -2604,8 +2761,10 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
13154  out_put_key:
13155         put_futex_key(&q.key);
13156  out:
13157 -       if (to)
13158 +       if (to) {
13159 +               hrtimer_cancel(&to->timer);
13160                 destroy_hrtimer_on_stack(&to->timer);
13161 +       }
13162         return ret != -EINTR ? ret : -ERESTARTNOINTR;
13163  
13164  uaddr_faulted:
13165 @@ -2632,7 +2791,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13166         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
13167         union futex_key key = FUTEX_KEY_INIT;
13168         struct futex_hash_bucket *hb;
13169 -       struct futex_q *match;
13170 +       struct futex_q *top_waiter;
13171         int ret;
13172  
13173  retry:
13174 @@ -2656,12 +2815,48 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13175          * all and we at least want to know if user space fiddled
13176          * with the futex value instead of blindly unlocking.
13177          */
13178 -       match = futex_top_waiter(hb, &key);
13179 -       if (match) {
13180 -               ret = wake_futex_pi(uaddr, uval, match, hb);
13181 +       top_waiter = futex_top_waiter(hb, &key);
13182 +       if (top_waiter) {
13183 +               struct futex_pi_state *pi_state = top_waiter->pi_state;
13184 +
13185 +               ret = -EINVAL;
13186 +               if (!pi_state)
13187 +                       goto out_unlock;
13188 +
13189 +               /*
13190 +                * If current does not own the pi_state then the futex is
13191 +                * inconsistent and user space fiddled with the futex value.
13192 +                */
13193 +               if (pi_state->owner != current)
13194 +                       goto out_unlock;
13195 +
13196 +               get_pi_state(pi_state);
13197 +               /*
13198 +                * By taking wait_lock while still holding hb->lock, we ensure
13199 +                * there is no point where we hold neither; and therefore
13200 +                * wake_futex_pi() must observe a state consistent with what we
13201 +                * observed.
13202 +                */
13203 +               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
13204 +               /*
13205 +                * Magic trickery for now to make the RT migrate disable
13206 +                * logic happy. The following spin_unlock() happens with
13207 +                * interrupts disabled so the internal migrate_enable()
13208 +                * won't undo the migrate_disable() which was issued when
13209 +                * locking hb->lock.
13210 +                */
13211 +               migrate_disable();
13212 +               spin_unlock(&hb->lock);
13213 +
13214 +               /* Drops pi_state->pi_mutex.wait_lock */
13215 +               ret = wake_futex_pi(uaddr, uval, pi_state);
13216 +
13217 +               migrate_enable();
13218 +
13219 +               put_pi_state(pi_state);
13220 +
13221                 /*
13222 -                * In case of success wake_futex_pi dropped the hash
13223 -                * bucket lock.
13224 +                * Success, we're done! No tricky corner cases.
13225                  */
13226                 if (!ret)
13227                         goto out_putkey;
13228 @@ -2676,7 +2871,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13229                  * setting the FUTEX_WAITERS bit. Try again.
13230                  */
13231                 if (ret == -EAGAIN) {
13232 -                       spin_unlock(&hb->lock);
13233                         put_futex_key(&key);
13234                         goto retry;
13235                 }
13236 @@ -2684,7 +2878,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13237                  * wake_futex_pi has detected invalid state. Tell user
13238                  * space.
13239                  */
13240 -               goto out_unlock;
13241 +               goto out_putkey;
13242         }
13243  
13244         /*
13245 @@ -2694,8 +2888,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13246          * preserve the WAITERS bit not the OWNER_DIED one. We are the
13247          * owner.
13248          */
13249 -       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
13250 +       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
13251 +               spin_unlock(&hb->lock);
13252                 goto pi_faulted;
13253 +       }
13254  
13255         /*
13256          * If uval has changed, let user space handle it.
13257 @@ -2709,7 +2905,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
13258         return ret;
13259  
13260  pi_faulted:
13261 -       spin_unlock(&hb->lock);
13262         put_futex_key(&key);
13263  
13264         ret = fault_in_user_writeable(uaddr);
13265 @@ -2813,8 +3008,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13266                                  u32 __user *uaddr2)
13267  {
13268         struct hrtimer_sleeper timeout, *to = NULL;
13269 +       struct futex_pi_state *pi_state = NULL;
13270         struct rt_mutex_waiter rt_waiter;
13271 -       struct futex_hash_bucket *hb;
13272 +       struct futex_hash_bucket *hb, *hb2;
13273         union futex_key key2 = FUTEX_KEY_INIT;
13274         struct futex_q q = futex_q_init;
13275         int res, ret;
13276 @@ -2839,10 +3035,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13277          * The waiter is allocated on our stack, manipulated by the requeue
13278          * code while we sleep on uaddr.
13279          */
13280 -       debug_rt_mutex_init_waiter(&rt_waiter);
13281 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
13282 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
13283 -       rt_waiter.task = NULL;
13284 +       rt_mutex_init_waiter(&rt_waiter, false);
13285  
13286         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
13287         if (unlikely(ret != 0))
13288 @@ -2873,20 +3066,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13289         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
13290         futex_wait_queue_me(hb, &q, to);
13291  
13292 -       spin_lock(&hb->lock);
13293 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
13294 -       spin_unlock(&hb->lock);
13295 -       if (ret)
13296 -               goto out_put_keys;
13297 +       /*
13298 +        * On RT we must avoid races with requeue and trying to block
13299 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
13300 +        * serializing access to pi_blocked_on with pi_lock.
13301 +        */
13302 +       raw_spin_lock_irq(&current->pi_lock);
13303 +       if (current->pi_blocked_on) {
13304 +               /*
13305 +                * We have been requeued or are in the process of
13306 +                * being requeued.
13307 +                */
13308 +               raw_spin_unlock_irq(&current->pi_lock);
13309 +       } else {
13310 +               /*
13311 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
13312 +                * prevents a concurrent requeue from moving us to the
13313 +                * uaddr2 rtmutex. After that we can safely acquire
13314 +                * (and possibly block on) hb->lock.
13315 +                */
13316 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
13317 +               raw_spin_unlock_irq(&current->pi_lock);
13318 +
13319 +               spin_lock(&hb->lock);
13320 +
13321 +               /*
13322 +                * Clean up pi_blocked_on. We might leak it otherwise
13323 +                * when we succeeded with the hb->lock in the fast
13324 +                * path.
13325 +                */
13326 +               raw_spin_lock_irq(&current->pi_lock);
13327 +               current->pi_blocked_on = NULL;
13328 +               raw_spin_unlock_irq(&current->pi_lock);
13329 +
13330 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
13331 +               spin_unlock(&hb->lock);
13332 +               if (ret)
13333 +                       goto out_put_keys;
13334 +       }
13335  
13336         /*
13337 -        * In order for us to be here, we know our q.key == key2, and since
13338 -        * we took the hb->lock above, we also know that futex_requeue() has
13339 -        * completed and we no longer have to concern ourselves with a wakeup
13340 -        * race with the atomic proxy lock acquisition by the requeue code. The
13341 -        * futex_requeue dropped our key1 reference and incremented our key2
13342 -        * reference count.
13343 +        * In order to be here, we have either been requeued, are in
13344 +        * the process of being requeued, or requeue successfully
13345 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
13346 +        * non-null above, we may be racing with a requeue.  Do not
13347 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
13348 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
13349 +        * reference and incremented our key2 reference count.
13350          */
13351 +       hb2 = hash_futex(&key2);
13352  
13353         /* Check if the requeue code acquired the second futex for us. */
13354         if (!q.rt_waiter) {
13355 @@ -2895,16 +3123,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13356                  * did a lock-steal - fix up the PI-state in that case.
13357                  */
13358                 if (q.pi_state && (q.pi_state->owner != current)) {
13359 -                       spin_lock(q.lock_ptr);
13360 +                       spin_lock(&hb2->lock);
13361 +                       BUG_ON(&hb2->lock != q.lock_ptr);
13362                         ret = fixup_pi_state_owner(uaddr2, &q, current);
13363 -                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
13364 -                               rt_mutex_unlock(&q.pi_state->pi_mutex);
13365 +                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
13366 +                               pi_state = q.pi_state;
13367 +                               get_pi_state(pi_state);
13368 +                       }
13369                         /*
13370                          * Drop the reference to the pi state which
13371                          * the requeue_pi() code acquired for us.
13372                          */
13373                         put_pi_state(q.pi_state);
13374 -                       spin_unlock(q.lock_ptr);
13375 +                       spin_unlock(&hb2->lock);
13376                 }
13377         } else {
13378                 struct rt_mutex *pi_mutex;
13379 @@ -2916,10 +3147,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13380                  */
13381                 WARN_ON(!q.pi_state);
13382                 pi_mutex = &q.pi_state->pi_mutex;
13383 -               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
13384 -               debug_rt_mutex_free_waiter(&rt_waiter);
13385 +               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
13386  
13387 -               spin_lock(q.lock_ptr);
13388 +               spin_lock(&hb2->lock);
13389 +               BUG_ON(&hb2->lock != q.lock_ptr);
13390 +               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
13391 +                       ret = 0;
13392 +
13393 +               debug_rt_mutex_free_waiter(&rt_waiter);
13394                 /*
13395                  * Fixup the pi_state owner and possibly acquire the lock if we
13396                  * haven't already.
13397 @@ -2937,13 +3172,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13398                  * the fault, unlock the rt_mutex and return the fault to
13399                  * userspace.
13400                  */
13401 -               if (ret && rt_mutex_owner(pi_mutex) == current)
13402 -                       rt_mutex_unlock(pi_mutex);
13403 +               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
13404 +                       pi_state = q.pi_state;
13405 +                       get_pi_state(pi_state);
13406 +               }
13407  
13408                 /* Unqueue and drop the lock. */
13409                 unqueue_me_pi(&q);
13410         }
13411  
13412 +       if (pi_state) {
13413 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
13414 +               put_pi_state(pi_state);
13415 +       }
13416 +
13417         if (ret == -EINTR) {
13418                 /*
13419                  * We've already been requeued, but cannot restart by calling
13420 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
13421 index d3f24905852c..f87aa8fdcc51 100644
13422 --- a/kernel/irq/handle.c
13423 +++ b/kernel/irq/handle.c
13424 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
13425  {
13426         irqreturn_t retval;
13427         unsigned int flags = 0;
13428 +       struct pt_regs *regs = get_irq_regs();
13429 +       u64 ip = regs ? instruction_pointer(regs) : 0;
13430  
13431         retval = __handle_irq_event_percpu(desc, &flags);
13432  
13433 -       add_interrupt_randomness(desc->irq_data.irq, flags);
13434 +#ifdef CONFIG_PREEMPT_RT_FULL
13435 +       desc->random_ip = ip;
13436 +#else
13437 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
13438 +#endif
13439  
13440         if (!noirqdebug)
13441                 note_interrupt(desc, retval);
13442 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
13443 index ea41820ab12e..5994867526f3 100644
13444 --- a/kernel/irq/manage.c
13445 +++ b/kernel/irq/manage.c
13446 @@ -22,6 +22,7 @@
13447  #include "internals.h"
13448  
13449  #ifdef CONFIG_IRQ_FORCED_THREADING
13450 +# ifndef CONFIG_PREEMPT_RT_BASE
13451  __read_mostly bool force_irqthreads;
13452  
13453  static int __init setup_forced_irqthreads(char *arg)
13454 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
13455         return 0;
13456  }
13457  early_param("threadirqs", setup_forced_irqthreads);
13458 +# endif
13459  #endif
13460  
13461  static void __synchronize_hardirq(struct irq_desc *desc)
13462 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
13463  
13464         if (desc->affinity_notify) {
13465                 kref_get(&desc->affinity_notify->kref);
13466 +
13467 +#ifdef CONFIG_PREEMPT_RT_BASE
13468 +               swork_queue(&desc->affinity_notify->swork);
13469 +#else
13470                 schedule_work(&desc->affinity_notify->work);
13471 +#endif
13472         }
13473         irqd_set(data, IRQD_AFFINITY_SET);
13474  
13475 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
13476  }
13477  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
13478  
13479 -static void irq_affinity_notify(struct work_struct *work)
13480 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
13481  {
13482 -       struct irq_affinity_notify *notify =
13483 -               container_of(work, struct irq_affinity_notify, work);
13484         struct irq_desc *desc = irq_to_desc(notify->irq);
13485         cpumask_var_t cpumask;
13486         unsigned long flags;
13487 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
13488         kref_put(&notify->kref, notify->release);
13489  }
13490  
13491 +#ifdef CONFIG_PREEMPT_RT_BASE
13492 +static void init_helper_thread(void)
13493 +{
13494 +       static int init_sworker_once;
13495 +
13496 +       if (init_sworker_once)
13497 +               return;
13498 +       if (WARN_ON(swork_get()))
13499 +               return;
13500 +       init_sworker_once = 1;
13501 +}
13502 +
13503 +static void irq_affinity_notify(struct swork_event *swork)
13504 +{
13505 +       struct irq_affinity_notify *notify =
13506 +               container_of(swork, struct irq_affinity_notify, swork);
13507 +       _irq_affinity_notify(notify);
13508 +}
13509 +
13510 +#else
13511 +
13512 +static void irq_affinity_notify(struct work_struct *work)
13513 +{
13514 +       struct irq_affinity_notify *notify =
13515 +               container_of(work, struct irq_affinity_notify, work);
13516 +       _irq_affinity_notify(notify);
13517 +}
13518 +#endif
13519 +
13520  /**
13521   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
13522   *     @irq:           Interrupt for which to enable/disable notification
13523 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
13524         if (notify) {
13525                 notify->irq = irq;
13526                 kref_init(&notify->kref);
13527 +#ifdef CONFIG_PREEMPT_RT_BASE
13528 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
13529 +               init_helper_thread();
13530 +#else
13531                 INIT_WORK(&notify->work, irq_affinity_notify);
13532 +#endif
13533         }
13534  
13535         raw_spin_lock_irqsave(&desc->lock, flags);
13536 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
13537         local_bh_disable();
13538         ret = action->thread_fn(action->irq, action->dev_id);
13539         irq_finalize_oneshot(desc, action);
13540 -       local_bh_enable();
13541 +       /*
13542 +        * Interrupts which have real time requirements can be set up
13543 +        * to avoid softirq processing in the thread handler. This is
13544 +        * safe as these interrupts do not raise soft interrupts.
13545 +        */
13546 +       if (irq_settings_no_softirq_call(desc))
13547 +               _local_bh_enable();
13548 +       else
13549 +               local_bh_enable();
13550         return ret;
13551  }
13552  
13553 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
13554                 if (action_ret == IRQ_WAKE_THREAD)
13555                         irq_wake_secondary(desc, action);
13556  
13557 +#ifdef CONFIG_PREEMPT_RT_FULL
13558 +               migrate_disable();
13559 +               add_interrupt_randomness(action->irq, 0,
13560 +                                desc->random_ip ^ (unsigned long) action);
13561 +               migrate_enable();
13562 +#endif
13563                 wake_threads_waitq(desc);
13564         }
13565  
13566 @@ -1338,6 +1391,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
13567                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
13568                 }
13569  
13570 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
13571 +                       irq_settings_set_no_softirq_call(desc);
13572 +
13573                 /* Set default affinity mask once everything is setup */
13574                 setup_affinity(desc, mask);
13575  
13576 @@ -2063,7 +2119,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
13577   *     This call sets the internal irqchip state of an interrupt,
13578   *     depending on the value of @which.
13579   *
13580 - *     This function should be called with preemption disabled if the
13581 + *     This function should be called with migration disabled if the
13582   *     interrupt controller has per-cpu registers.
13583   */
13584  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
13585 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
13586 index 320579d89091..2df2d4445b1e 100644
13587 --- a/kernel/irq/settings.h
13588 +++ b/kernel/irq/settings.h
13589 @@ -16,6 +16,7 @@ enum {
13590         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
13591         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
13592         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
13593 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
13594         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
13595  };
13596  
13597 @@ -30,6 +31,7 @@ enum {
13598  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
13599  #define IRQ_IS_POLLED          GOT_YOU_MORON
13600  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
13601 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
13602  #undef IRQF_MODIFY_MASK
13603  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
13604  
13605 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
13606         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
13607  }
13608  
13609 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
13610 +{
13611 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
13612 +}
13613 +
13614 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
13615 +{
13616 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
13617 +}
13618 +
13619  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
13620  {
13621         return desc->status_use_accessors & _IRQ_PER_CPU;
13622 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
13623 index 5707f97a3e6a..73f38dc7a7fb 100644
13624 --- a/kernel/irq/spurious.c
13625 +++ b/kernel/irq/spurious.c
13626 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
13627  
13628  static int __init irqfixup_setup(char *str)
13629  {
13630 +#ifdef CONFIG_PREEMPT_RT_BASE
13631 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13632 +       return 1;
13633 +#endif
13634         irqfixup = 1;
13635         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
13636         printk(KERN_WARNING "This may impact system performance.\n");
13637 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
13638  
13639  static int __init irqpoll_setup(char *str)
13640  {
13641 +#ifdef CONFIG_PREEMPT_RT_BASE
13642 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13643 +       return 1;
13644 +#endif
13645         irqfixup = 2;
13646         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
13647                                 "enabled\n");
13648 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
13649 index bcf107ce0854..2899ba0d23d1 100644
13650 --- a/kernel/irq_work.c
13651 +++ b/kernel/irq_work.c
13652 @@ -17,6 +17,7 @@
13653  #include <linux/cpu.h>
13654  #include <linux/notifier.h>
13655  #include <linux/smp.h>
13656 +#include <linux/interrupt.h>
13657  #include <asm/processor.h>
13658  
13659  
13660 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
13661   */
13662  bool irq_work_queue_on(struct irq_work *work, int cpu)
13663  {
13664 +       struct llist_head *list;
13665 +
13666         /* All work should have been flushed before going offline */
13667         WARN_ON_ONCE(cpu_is_offline(cpu));
13668  
13669 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
13670         if (!irq_work_claim(work))
13671                 return false;
13672  
13673 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
13674 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
13675 +               list = &per_cpu(lazy_list, cpu);
13676 +       else
13677 +               list = &per_cpu(raised_list, cpu);
13678 +
13679 +       if (llist_add(&work->llnode, list))
13680                 arch_send_call_function_single_ipi(cpu);
13681  
13682         return true;
13683 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
13684  /* Enqueue the irq work @work on the current CPU */
13685  bool irq_work_queue(struct irq_work *work)
13686  {
13687 +       struct llist_head *list;
13688 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
13689 +
13690         /* Only queue if not already pending */
13691         if (!irq_work_claim(work))
13692                 return false;
13693 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
13694         /* Queue the entry and raise the IPI if needed. */
13695         preempt_disable();
13696  
13697 -       /* If the work is "lazy", handle it from next tick if any */
13698 -       if (work->flags & IRQ_WORK_LAZY) {
13699 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
13700 -                   tick_nohz_tick_stopped())
13701 -                       arch_irq_work_raise();
13702 -       } else {
13703 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
13704 +       lazy_work = work->flags & IRQ_WORK_LAZY;
13705 +
13706 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
13707 +               list = this_cpu_ptr(&lazy_list);
13708 +       else
13709 +               list = this_cpu_ptr(&raised_list);
13710 +
13711 +       if (llist_add(&work->llnode, list)) {
13712 +               if (!lazy_work || tick_nohz_tick_stopped())
13713                         arch_irq_work_raise();
13714         }
13715  
13716 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
13717         raised = this_cpu_ptr(&raised_list);
13718         lazy = this_cpu_ptr(&lazy_list);
13719  
13720 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
13721 -               if (llist_empty(lazy))
13722 -                       return false;
13723 +       if (llist_empty(raised) && llist_empty(lazy))
13724 +               return false;
13725  
13726         /* All work should have been flushed before going offline */
13727         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
13728 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
13729         struct irq_work *work;
13730         struct llist_node *llnode;
13731  
13732 -       BUG_ON(!irqs_disabled());
13733 +       BUG_ON_NONRT(!irqs_disabled());
13734  
13735         if (llist_empty(list))
13736                 return;
13737 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
13738  void irq_work_run(void)
13739  {
13740         irq_work_run_list(this_cpu_ptr(&raised_list));
13741 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
13742 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
13743 +               /*
13744 +                * NOTE: we raise softirq via IPI for safety,
13745 +                * and execute in irq_work_tick() to move the
13746 +                * overhead from hard to soft irq context.
13747 +                */
13748 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
13749 +                       raise_softirq(TIMER_SOFTIRQ);
13750 +       } else
13751 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13752  }
13753  EXPORT_SYMBOL_GPL(irq_work_run);
13754  
13755 @@ -179,8 +200,17 @@ void irq_work_tick(void)
13756  
13757         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
13758                 irq_work_run_list(raised);
13759 +
13760 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
13761 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13762 +}
13763 +
13764 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
13765 +void irq_work_tick_soft(void)
13766 +{
13767         irq_work_run_list(this_cpu_ptr(&lazy_list));
13768  }
13769 +#endif
13770  
13771  /*
13772   * Synchronize against the irq_work @entry, ensures the entry is not
13773 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
13774 index ee1bc1bb8feb..ddef07958840 100644
13775 --- a/kernel/ksysfs.c
13776 +++ b/kernel/ksysfs.c
13777 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
13778  
13779  #endif /* CONFIG_KEXEC_CORE */
13780  
13781 +#if defined(CONFIG_PREEMPT_RT_FULL)
13782 +static ssize_t  realtime_show(struct kobject *kobj,
13783 +                             struct kobj_attribute *attr, char *buf)
13784 +{
13785 +       return sprintf(buf, "%d\n", 1);
13786 +}
13787 +KERNEL_ATTR_RO(realtime);
13788 +#endif
13789 +
13790  /* whether file capabilities are enabled */
13791  static ssize_t fscaps_show(struct kobject *kobj,
13792                                   struct kobj_attribute *attr, char *buf)
13793 @@ -225,6 +234,9 @@ static struct attribute * kernel_attrs[] = {
13794         &rcu_expedited_attr.attr,
13795         &rcu_normal_attr.attr,
13796  #endif
13797 +#ifdef CONFIG_PREEMPT_RT_FULL
13798 +       &realtime_attr.attr,
13799 +#endif
13800         NULL
13801  };
13802  
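The attribute added above shows up as /sys/kernel/realtime and always reads "1"; the file only exists when CONFIG_PREEMPT_RT_FULL is enabled. A small user-space probe, sketched under that assumption only:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        if (f) {
                if (fscanf(f, "%d", &rt) != 1)
                        rt = 0;
                fclose(f);
        }
        printf("PREEMPT_RT_FULL kernel: %s\n", rt ? "yes" : "no");
        return 0;
}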
13803 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
13804 index 6f88e352cd4f..6ff9e8011dd0 100644
13805 --- a/kernel/locking/Makefile
13806 +++ b/kernel/locking/Makefile
13807 @@ -2,7 +2,7 @@
13808  # and is generally not a function of system call inputs.
13809  KCOV_INSTRUMENT                := n
13810  
13811 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
13812 +obj-y += semaphore.o percpu-rwsem.o
13813  
13814  ifdef CONFIG_FUNCTION_TRACER
13815  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
13816 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
13817  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
13818  endif
13819  
13820 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13821 +obj-y += mutex.o
13822  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
13823 +endif
13824 +obj-y += rwsem.o
13825  obj-$(CONFIG_LOCKDEP) += lockdep.o
13826  ifeq ($(CONFIG_PROC_FS),y)
13827  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
13828 @@ -24,7 +28,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
13829  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
13830  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
13831  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
13832 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13833  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
13834  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
13835 +endif
13836 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o rwsem-rt.o
13837  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
13838  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
13839 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
13840 index 6599c7f3071d..79f8e00e802e 100644
13841 --- a/kernel/locking/lockdep.c
13842 +++ b/kernel/locking/lockdep.c
13843 @@ -658,6 +658,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13844         struct lockdep_subclass_key *key;
13845         struct hlist_head *hash_head;
13846         struct lock_class *class;
13847 +       bool is_static = false;
13848  
13849         if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
13850                 debug_locks_off();
13851 @@ -671,10 +672,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13852  
13853         /*
13854          * Static locks do not have their class-keys yet - for them the key
13855 -        * is the lock object itself:
13856 +        * is the lock object itself. If the lock is in the per cpu area,
13857 +        * the canonical address of the lock (per cpu offset removed) is
13858 +        * used.
13859          */
13860 -       if (unlikely(!lock->key))
13861 -               lock->key = (void *)lock;
13862 +       if (unlikely(!lock->key)) {
13863 +               unsigned long can_addr, addr = (unsigned long)lock;
13864 +
13865 +               if (__is_kernel_percpu_address(addr, &can_addr))
13866 +                       lock->key = (void *)can_addr;
13867 +               else if (__is_module_percpu_address(addr, &can_addr))
13868 +                       lock->key = (void *)can_addr;
13869 +               else if (static_obj(lock))
13870 +                       lock->key = (void *)lock;
13871 +               else
13872 +                       return ERR_PTR(-EINVAL);
13873 +               is_static = true;
13874 +       }
13875  
13876         /*
13877          * NOTE: the class-key must be unique. For dynamic locks, a static
13878 @@ -706,7 +720,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13879                 }
13880         }
13881  
13882 -       return NULL;
13883 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
13884  }
13885  
13886  /*
13887 @@ -724,19 +738,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
13888         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
13889  
13890         class = look_up_lock_class(lock, subclass);
13891 -       if (likely(class))
13892 +       if (likely(!IS_ERR_OR_NULL(class)))
13893                 goto out_set_class_cache;
13894  
13895         /*
13896          * Debug-check: all keys must be persistent!
13897 -        */
13898 -       if (!static_obj(lock->key)) {
13899 +        */
13900 +       if (IS_ERR(class)) {
13901                 debug_locks_off();
13902                 printk("INFO: trying to register non-static key.\n");
13903                 printk("the code is fine but needs lockdep annotation.\n");
13904                 printk("turning off the locking correctness validator.\n");
13905                 dump_stack();
13906 -
13907                 return NULL;
13908         }
13909  
13910 @@ -3417,7 +3430,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
13911                  * Clearly if the lock hasn't been acquired _ever_, we're not
13912                  * holding it either, so report failure.
13913                  */
13914 -               if (!class)
13915 +               if (IS_ERR_OR_NULL(class))
13916                         return 0;
13917  
13918                 /*
13919 @@ -3696,6 +3709,7 @@ static void check_flags(unsigned long flags)
13920                 }
13921         }
13922  
13923 +#ifndef CONFIG_PREEMPT_RT_FULL
13924         /*
13925          * We dont accurately track softirq state in e.g.
13926          * hardirq contexts (such as on 4KSTACKS), so only
13927 @@ -3710,6 +3724,7 @@ static void check_flags(unsigned long flags)
13928                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
13929                 }
13930         }
13931 +#endif
13932  
13933         if (!debug_locks)
13934                 print_irqtrace_events(current);
13935 @@ -4166,7 +4181,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
13936                  * If the class exists we look it up and zap it:
13937                  */
13938                 class = look_up_lock_class(lock, j);
13939 -               if (class)
13940 +               if (!IS_ERR_OR_NULL(class))
13941                         zap_class(class);
13942         }
13943         /*
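The look_up_lock_class() changes above let statically initialized per-CPU locks, whose lockdep key defaults to the lock address, be keyed by their canonical address (per-CPU offset removed), so all CPU copies share one lock_class instead of triggering the "non-static key" splat. A minimal sketch of the affected pattern (demo_lock and demo_use are assumed example names):

#include <linux/percpu.h>
#include <linux/spinlock.h>

/* Statically initialized, so no explicit lock_class_key is attached and
 * lockdep falls back to using the lock's address as its key. */
static DEFINE_PER_CPU(spinlock_t, demo_lock) = __SPIN_LOCK_UNLOCKED(demo_lock);

static void demo_use(void)
{
        spinlock_t *lock = this_cpu_ptr(&demo_lock);

        /* With the hunks above, every CPU's copy of demo_lock maps to the
         * same canonical address and hence the same lock_class. */
        spin_lock(lock);
        spin_unlock(lock);
}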
13944 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
13945 index d3de04b12f8c..0f49abeae337 100644
13946 --- a/kernel/locking/locktorture.c
13947 +++ b/kernel/locking/locktorture.c
13948 @@ -26,7 +26,6 @@
13949  #include <linux/kthread.h>
13950  #include <linux/sched/rt.h>
13951  #include <linux/spinlock.h>
13952 -#include <linux/rwlock.h>
13953  #include <linux/mutex.h>
13954  #include <linux/rwsem.h>
13955  #include <linux/smp.h>
13956 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
13957 index ce182599cf2e..2ad3a1e8344c 100644
13958 --- a/kernel/locking/percpu-rwsem.c
13959 +++ b/kernel/locking/percpu-rwsem.c
13960 @@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
13961         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
13962         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
13963         __init_rwsem(&sem->rw_sem, name, rwsem_key);
13964 -       init_waitqueue_head(&sem->writer);
13965 +       init_swait_queue_head(&sem->writer);
13966         sem->readers_block = 0;
13967         return 0;
13968  }
13969 @@ -103,7 +103,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
13970         __this_cpu_dec(*sem->read_count);
13971  
13972         /* Prod writer to recheck readers_active */
13973 -       wake_up(&sem->writer);
13974 +       swake_up(&sem->writer);
13975  }
13976  EXPORT_SYMBOL_GPL(__percpu_up_read);
13977  
13978 @@ -160,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
13979          */
13980  
13981         /* Wait for all now active readers to complete. */
13982 -       wait_event(sem->writer, readers_active_check(sem));
13983 +       swait_event(sem->writer, readers_active_check(sem));
13984  }
13985  EXPORT_SYMBOL_GPL(percpu_down_write);
13986  
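The writer side of the per-CPU rwsem is moved from a regular waitqueue to a simple waitqueue (swait), whose internal lock is a raw spinlock and can therefore be taken in contexts where RT turns normal waitqueue locks into sleeping locks. A minimal sketch of the swait pattern being adopted (demo_wq, demo_done and the helpers are assumed example names):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;          /* plain flag, kept simple for the sketch */

static void demo_wait(void)
{
        /* Sleep (uninterruptibly) until demo_done becomes true. */
        swait_event(demo_wq, demo_done);
}

static void demo_complete(void)
{
        demo_done = true;
        /* Wake one waiter; matches the swake_up() used above. */
        swake_up(&demo_wq);
}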
13987 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
13988 new file mode 100644
13989 index 000000000000..6284e3b15091
13990 --- /dev/null
13991 +++ b/kernel/locking/rt.c
13992 @@ -0,0 +1,331 @@
13993 +/*
13994 + * kernel/rt.c
13995 + *
13996 + * Real-Time Preemption Support
13997 + *
13998 + * started by Ingo Molnar:
13999 + *
14000 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
14001 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
14002 + *
14003 + * historic credit for proving that Linux spinlocks can be implemented via
14004 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
14005 + * and others) who prototyped it on 2.4 and did lots of comparative
14006 + * research and analysis; TimeSys, for proving that you can implement a
14007 + * fully preemptible kernel via the use of IRQ threading and mutexes;
14008 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
14009 + * right one; and to MontaVista, who ported pmutexes to 2.6.
14010 + *
14011 + * This code is a from-scratch implementation and is not based on pmutexes,
14012 + * but the idea of converting spinlocks to mutexes is used here too.
14013 + *
14014 + * lock debugging, locking tree, deadlock detection:
14015 + *
14016 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
14017 + *  Released under the General Public License (GPL).
14018 + *
14019 + * Includes portions of the generic R/W semaphore implementation from:
14020 + *
14021 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
14022 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
14023 + *  - Derived also from comments by Linus
14024 + *
14025 + * Pending ownership of locks and ownership stealing:
14026 + *
14027 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
14028 + *
14029 + *   (also by Steven Rostedt)
14030 + *    - Converted single pi_lock to individual task locks.
14031 + *
14032 + * By Esben Nielsen:
14033 + *    Doing priority inheritance with help of the scheduler.
14034 + *
14035 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
14036 + *  - major rework based on Esben Nielsen's initial patch
14037 + *  - replaced thread_info references by task_struct refs
14038 + *  - removed task->pending_owner dependency
14039 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
14040 + *    in the scheduler return path as discussed with Steven Rostedt
14041 + *
14042 + *  Copyright (C) 2006, Kihon Technologies Inc.
14043 + *    Steven Rostedt <rostedt@goodmis.org>
14044 + *  - debugged and patched Thomas Gleixner's rework.
14045 + *  - added back the cmpxchg to the rework.
14046 + *  - turned atomic require back on for SMP.
14047 + */
14048 +
14049 +#include <linux/spinlock.h>
14050 +#include <linux/rtmutex.h>
14051 +#include <linux/sched.h>
14052 +#include <linux/delay.h>
14053 +#include <linux/module.h>
14054 +#include <linux/kallsyms.h>
14055 +#include <linux/syscalls.h>
14056 +#include <linux/interrupt.h>
14057 +#include <linux/plist.h>
14058 +#include <linux/fs.h>
14059 +#include <linux/futex.h>
14060 +#include <linux/hrtimer.h>
14061 +
14062 +#include "rtmutex_common.h"
14063 +
14064 +/*
14065 + * struct mutex functions
14066 + */
14067 +void __mutex_do_init(struct mutex *mutex, const char *name,
14068 +                    struct lock_class_key *key)
14069 +{
14070 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14071 +       /*
14072 +        * Make sure we are not reinitializing a held lock:
14073 +        */
14074 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
14075 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
14076 +#endif
14077 +       mutex->lock.save_state = 0;
14078 +}
14079 +EXPORT_SYMBOL(__mutex_do_init);
14080 +
14081 +void __lockfunc _mutex_lock(struct mutex *lock)
14082 +{
14083 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14084 +       rt_mutex_lock(&lock->lock);
14085 +}
14086 +EXPORT_SYMBOL(_mutex_lock);
14087 +
14088 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
14089 +{
14090 +       int ret;
14091 +
14092 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14093 +       ret = rt_mutex_lock_interruptible(&lock->lock);
14094 +       if (ret)
14095 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14096 +       return ret;
14097 +}
14098 +EXPORT_SYMBOL(_mutex_lock_interruptible);
14099 +
14100 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
14101 +{
14102 +       int ret;
14103 +
14104 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14105 +       ret = rt_mutex_lock_killable(&lock->lock);
14106 +       if (ret)
14107 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14108 +       return ret;
14109 +}
14110 +EXPORT_SYMBOL(_mutex_lock_killable);
14111 +
14112 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14113 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
14114 +{
14115 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
14116 +       rt_mutex_lock(&lock->lock);
14117 +}
14118 +EXPORT_SYMBOL(_mutex_lock_nested);
14119 +
14120 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
14121 +{
14122 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
14123 +       rt_mutex_lock(&lock->lock);
14124 +}
14125 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
14126 +
14127 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
14128 +{
14129 +       int ret;
14130 +
14131 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
14132 +       ret = rt_mutex_lock_interruptible(&lock->lock);
14133 +       if (ret)
14134 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14135 +       return ret;
14136 +}
14137 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
14138 +
14139 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
14140 +{
14141 +       int ret;
14142 +
14143 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
14144 +       ret = rt_mutex_lock_killable(&lock->lock);
14145 +       if (ret)
14146 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
14147 +       return ret;
14148 +}
14149 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
14150 +#endif
14151 +
14152 +int __lockfunc _mutex_trylock(struct mutex *lock)
14153 +{
14154 +       int ret = rt_mutex_trylock(&lock->lock);
14155 +
14156 +       if (ret)
14157 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14158 +
14159 +       return ret;
14160 +}
14161 +EXPORT_SYMBOL(_mutex_trylock);
14162 +
14163 +void __lockfunc _mutex_unlock(struct mutex *lock)
14164 +{
14165 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
14166 +       rt_mutex_unlock(&lock->lock);
14167 +}
14168 +EXPORT_SYMBOL(_mutex_unlock);
14169 +
14170 +/*
14171 + * rwlock_t functions
14172 + */
14173 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
14174 +{
14175 +       int ret;
14176 +
14177 +       migrate_disable();
14178 +       ret = rt_mutex_trylock(&rwlock->lock);
14179 +       if (ret)
14180 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
14181 +       else
14182 +               migrate_enable();
14183 +
14184 +       return ret;
14185 +}
14186 +EXPORT_SYMBOL(rt_write_trylock);
14187 +
14188 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
14189 +{
14190 +       int ret;
14191 +
14192 +       *flags = 0;
14193 +       ret = rt_write_trylock(rwlock);
14194 +       return ret;
14195 +}
14196 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
14197 +
14198 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
14199 +{
14200 +       struct rt_mutex *lock = &rwlock->lock;
14201 +       int ret = 1;
14202 +
14203 +       /*
14204 +        * Recursive read locks succeed when current already owns the lock;
14205 +        * read_depth == 0 in that case means current holds it write locked,
14206 +        * so the read trylock must fail.
14207 +        */
14208 +       if (rt_mutex_owner(lock) != current) {
14209 +               migrate_disable();
14210 +               ret = rt_mutex_trylock(lock);
14211 +               if (ret)
14212 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
14213 +               else
14214 +                       migrate_enable();
14215 +
14216 +       } else if (!rwlock->read_depth) {
14217 +               ret = 0;
14218 +       }
14219 +
14220 +       if (ret)
14221 +               rwlock->read_depth++;
14222 +
14223 +       return ret;
14224 +}
14225 +EXPORT_SYMBOL(rt_read_trylock);
14226 +
14227 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
14228 +{
14229 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
14230 +       __rt_spin_lock(&rwlock->lock);
14231 +}
14232 +EXPORT_SYMBOL(rt_write_lock);
14233 +
14234 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
14235 +{
14236 +       struct rt_mutex *lock = &rwlock->lock;
14237 +
14238 +
14239 +       /*
14240 +        * recursive read locks succeed when current owns the lock
14241 +        */
14242 +       if (rt_mutex_owner(lock) != current) {
14243 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
14244 +               __rt_spin_lock(lock);
14245 +       }
14246 +       rwlock->read_depth++;
14247 +}
14248 +
14249 +EXPORT_SYMBOL(rt_read_lock);
14250 +
14251 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
14252 +{
14253 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14254 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
14255 +       __rt_spin_unlock(&rwlock->lock);
14256 +       migrate_enable();
14257 +}
14258 +EXPORT_SYMBOL(rt_write_unlock);
14259 +
14260 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
14261 +{
14262 +       /* Release the lock only when read_depth is down to 0 */
14263 +       if (--rwlock->read_depth == 0) {
14264 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
14265 +               __rt_spin_unlock(&rwlock->lock);
14266 +               migrate_enable();
14267 +       }
14268 +}
14269 +EXPORT_SYMBOL(rt_read_unlock);
14270 +
14271 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
14272 +{
14273 +       rt_write_lock(rwlock);
14274 +
14275 +       return 0;
14276 +}
14277 +EXPORT_SYMBOL(rt_write_lock_irqsave);
14278 +
14279 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
14280 +{
14281 +       rt_read_lock(rwlock);
14282 +
14283 +       return 0;
14284 +}
14285 +EXPORT_SYMBOL(rt_read_lock_irqsave);
14286 +
14287 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
14288 +{
14289 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14290 +       /*
14291 +        * Make sure we are not reinitializing a held lock:
14292 +        */
14293 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
14294 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
14295 +#endif
14296 +       rwlock->lock.save_state = 1;
14297 +       rwlock->read_depth = 0;
14298 +}
14299 +EXPORT_SYMBOL(__rt_rwlock_init);
14300 +
14301 +/**
14302 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
14303 + * @cnt: the atomic which we are to dec
14304 + * @lock: the mutex to return holding if we dec to 0
14305 + *
14306 + * return true and hold lock if we dec to 0, return false otherwise
14307 + */
14308 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
14309 +{
14310 +       /* dec if we can't possibly hit 0 */
14311 +       if (atomic_add_unless(cnt, -1, 1))
14312 +               return 0;
14313 +       /* we might hit 0, so take the lock */
14314 +       mutex_lock(lock);
14315 +       if (!atomic_dec_and_test(cnt)) {
14316 +               /* when we actually did the dec, we didn't hit 0 */
14317 +               mutex_unlock(lock);
14318 +               return 0;
14319 +       }
14320 +       /* we hit 0, and we hold the lock */
14321 +       return 1;
14322 +}
14323 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
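atomic_dec_and_mutex_lock() above is the usual "take the lock only for the final put" helper, rebuilt on top of the rtmutex-backed struct mutex. A hypothetical caller (demo_refs, demo_lock and demo_put are assumed example names):

#include <linux/atomic.h>
#include <linux/mutex.h>

static atomic_t demo_refs = ATOMIC_INIT(1);
static DEFINE_MUTEX(demo_lock);

static void demo_put(void)
{
        /* Returns 1 with demo_lock held only when the count hit zero. */
        if (atomic_dec_and_mutex_lock(&demo_refs, &demo_lock)) {
                /* ... tear down the object while holding demo_lock ... */
                mutex_unlock(&demo_lock);
        }
}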
14324 diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
14325 index 62b6cee8ea7f..0613c4b1d059 100644
14326 --- a/kernel/locking/rtmutex-debug.c
14327 +++ b/kernel/locking/rtmutex-debug.c
14328 @@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
14329         lock->name = name;
14330  }
14331  
14332 -void
14333 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
14334 -{
14335 -}
14336 -
14337 -void rt_mutex_deadlock_account_unlock(struct task_struct *task)
14338 -{
14339 -}
14340 -
14341 diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
14342 index d0519c3432b6..b585af9a1b50 100644
14343 --- a/kernel/locking/rtmutex-debug.h
14344 +++ b/kernel/locking/rtmutex-debug.h
14345 @@ -9,9 +9,6 @@
14346   * This file contains macros used solely by rtmutex.c. Debug version.
14347   */
14348  
14349 -extern void
14350 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
14351 -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
14352  extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
14353  extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
14354  extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
14355 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
14356 index 2c49d76f96c3..3a8b5d44aaf8 100644
14357 --- a/kernel/locking/rtmutex.c
14358 +++ b/kernel/locking/rtmutex.c
14359 @@ -7,6 +7,11 @@
14360   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
14361   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
14362   *  Copyright (C) 2006 Esben Nielsen
14363 + *  Adaptive Spinlocks:
14364 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
14365 + *                                  and Peter Morreale,
14366 + * Adaptive Spinlocks simplification:
14367 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
14368   *
14369   *  See Documentation/locking/rt-mutex-design.txt for details.
14370   */
14371 @@ -16,6 +21,8 @@
14372  #include <linux/sched/rt.h>
14373  #include <linux/sched/deadline.h>
14374  #include <linux/timer.h>
14375 +#include <linux/ww_mutex.h>
14376 +#include <linux/blkdev.h>
14377  
14378  #include "rtmutex_common.h"
14379  
14380 @@ -133,6 +140,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
14381                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
14382  }
14383  
14384 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
14385 +{
14386 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
14387 +               waiter != PI_REQUEUE_INPROGRESS;
14388 +}
14389 +
14390  /*
14391   * We can speed up the acquire/release, if there's no debugging state to be
14392   * set up.
14393 @@ -222,6 +235,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
14394  }
14395  #endif
14396  
14397 +/*
14398 + * Only use with rt_mutex_waiter_{less,equal}()
14399 + */
14400 +#define task_to_waiter(p) &(struct rt_mutex_waiter) \
14401 +       { .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) }
14402 +
14403  static inline int
14404  rt_mutex_waiter_less(struct rt_mutex_waiter *left,
14405                      struct rt_mutex_waiter *right)
14406 @@ -236,12 +255,51 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
14407          * then right waiter has a dl_prio() too.
14408          */
14409         if (dl_prio(left->prio))
14410 -               return dl_time_before(left->task->dl.deadline,
14411 -                                     right->task->dl.deadline);
14412 +               return dl_time_before(left->deadline, right->deadline);
14413  
14414         return 0;
14415  }
14416  
14417 +static inline int
14418 +rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
14419 +                     struct rt_mutex_waiter *right)
14420 +{
14421 +       if (left->prio != right->prio)
14422 +               return 0;
14423 +
14424 +       /*
14425 +        * If both waiters have dl_prio(), we check the deadlines of the
14426 +        * associated tasks.
14427 +        * If left waiter has a dl_prio(), and we didn't return 0 above,
14428 +        * then right waiter has a dl_prio() too.
14429 +        */
14430 +       if (dl_prio(left->prio))
14431 +               return left->deadline == right->deadline;
14432 +
14433 +       return 1;
14434 +}
14435 +
14436 +#define STEAL_NORMAL  0
14437 +#define STEAL_LATERAL 1
14438 +
14439 +static inline int
14440 +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode)
14441 +{
14442 +       struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
14443 +
14444 +       if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter))
14445 +               return 1;
14446 +
14447 +       /*
14448 +        * Note that RT tasks are excluded from lateral-steals
14449 +        * to prevent the introduction of an unbounded latency.
14450 +        */
14451 +       if (mode == STEAL_NORMAL || rt_task(waiter->task))
14452 +               return 0;
14453 +
14454 +       return rt_mutex_waiter_equal(waiter, top_waiter);
14455 +}
14456 +
14457  static void
14458  rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
14459  {
14460 @@ -320,72 +378,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
14461         RB_CLEAR_NODE(&waiter->pi_tree_entry);
14462  }
14463  
14464 -/*
14465 - * Calculate task priority from the waiter tree priority
14466 - *
14467 - * Return task->normal_prio when the waiter tree is empty or when
14468 - * the waiter is not allowed to do priority boosting
14469 - */
14470 -int rt_mutex_getprio(struct task_struct *task)
14471 -{
14472 -       if (likely(!task_has_pi_waiters(task)))
14473 -               return task->normal_prio;
14474 -
14475 -       return min(task_top_pi_waiter(task)->prio,
14476 -                  task->normal_prio);
14477 -}
14478 -
14479 -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
14480 -{
14481 -       if (likely(!task_has_pi_waiters(task)))
14482 -               return NULL;
14483 -
14484 -       return task_top_pi_waiter(task)->task;
14485 -}
14486 -
14487 -/*
14488 - * Called by sched_setscheduler() to get the priority which will be
14489 - * effective after the change.
14490 - */
14491 -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
14492 -{
14493 -       if (!task_has_pi_waiters(task))
14494 -               return newprio;
14495 -
14496 -       if (task_top_pi_waiter(task)->task->prio <= newprio)
14497 -               return task_top_pi_waiter(task)->task->prio;
14498 -       return newprio;
14499 -}
14500 -
14501 -/*
14502 - * Adjust the priority of a task, after its pi_waiters got modified.
14503 - *
14504 - * This can be both boosting and unboosting. task->pi_lock must be held.
14505 - */
14506 -static void __rt_mutex_adjust_prio(struct task_struct *task)
14507 +static void rt_mutex_adjust_prio(struct task_struct *p)
14508  {
14509 -       int prio = rt_mutex_getprio(task);
14510 +       struct task_struct *pi_task = NULL;
14511  
14512 -       if (task->prio != prio || dl_prio(prio))
14513 -               rt_mutex_setprio(task, prio);
14514 -}
14515 +       lockdep_assert_held(&p->pi_lock);
14516  
14517 -/*
14518 - * Adjust task priority (undo boosting). Called from the exit path of
14519 - * rt_mutex_slowunlock() and rt_mutex_slowlock().
14520 - *
14521 - * (Note: We do this outside of the protection of lock->wait_lock to
14522 - * allow the lock to be taken while or before we readjust the priority
14523 - * of task. We do not use the spin_xx_mutex() variants here as we are
14524 - * outside of the debug path.)
14525 - */
14526 -void rt_mutex_adjust_prio(struct task_struct *task)
14527 -{
14528 -       unsigned long flags;
14529 +       if (task_has_pi_waiters(p))
14530 +               pi_task = task_top_pi_waiter(p)->task;
14531  
14532 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
14533 -       __rt_mutex_adjust_prio(task);
14534 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14535 +       rt_mutex_setprio(p, pi_task);
14536  }
14537  
14538  /*
14539 @@ -414,6 +416,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
14540         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
14541  }
14542  
14543 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
14544 +{
14545 +       if (waiter->savestate)
14546 +               wake_up_lock_sleeper(waiter->task);
14547 +       else
14548 +               wake_up_process(waiter->task);
14549 +}
14550 +
14551  /*
14552   * Max number of times we'll walk the boosting chain:
14553   */
14554 @@ -421,7 +431,8 @@ int max_lock_depth = 1024;
14555  
14556  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
14557  {
14558 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
14559 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
14560 +               p->pi_blocked_on->lock : NULL;
14561  }
14562  
14563  /*
14564 @@ -557,7 +568,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14565          * reached or the state of the chain has changed while we
14566          * dropped the locks.
14567          */
14568 -       if (!waiter)
14569 +       if (!rt_mutex_real_waiter(waiter))
14570                 goto out_unlock_pi;
14571  
14572         /*
14573 @@ -608,7 +619,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14574          * enabled we continue, but stop the requeueing in the chain
14575          * walk.
14576          */
14577 -       if (waiter->prio == task->prio) {
14578 +       if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
14579                 if (!detect_deadlock)
14580                         goto out_unlock_pi;
14581                 else
14582 @@ -704,7 +715,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14583  
14584         /* [7] Requeue the waiter in the lock waiter tree. */
14585         rt_mutex_dequeue(lock, waiter);
14586 +
14587 +       /*
14588 +        * Update the waiter prio fields now that we're dequeued.
14589 +        *
14590 +        * These values can have changed through either:
14591 +        *
14592 +        *   sys_sched_set_scheduler() / sys_sched_setattr()
14593 +        *
14594 +        * or
14595 +        *
14596 +        *   DL CBS enforcement advancing the effective deadline.
14597 +        *
14598 +        * Even though pi_waiters also uses these fields, and that tree is only
14599 +        * updated in [11], we can do this here, since we hold [L], which
14600 +        * serializes all pi_waiters access and rb_erase() does not care about
14601 +        * the values of the node being removed.
14602 +        */
14603         waiter->prio = task->prio;
14604 +       waiter->deadline = task->dl.deadline;
14605 +
14606         rt_mutex_enqueue(lock, waiter);
14607  
14608         /* [8] Release the task */
14609 @@ -719,13 +749,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14610          * follow here. This is the end of the chain we are walking.
14611          */
14612         if (!rt_mutex_owner(lock)) {
14613 +               struct rt_mutex_waiter *lock_top_waiter;
14614 +
14615                 /*
14616                  * If the requeue [7] above changed the top waiter,
14617                  * then we need to wake the new top waiter up to try
14618                  * to get the lock.
14619                  */
14620 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
14621 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
14622 +               lock_top_waiter = rt_mutex_top_waiter(lock);
14623 +               if (prerequeue_top_waiter != lock_top_waiter)
14624 +                       rt_mutex_wake_waiter(lock_top_waiter);
14625                 raw_spin_unlock_irq(&lock->wait_lock);
14626                 return 0;
14627         }
14628 @@ -745,7 +778,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14629                  */
14630                 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
14631                 rt_mutex_enqueue_pi(task, waiter);
14632 -               __rt_mutex_adjust_prio(task);
14633 +               rt_mutex_adjust_prio(task);
14634  
14635         } else if (prerequeue_top_waiter == waiter) {
14636                 /*
14637 @@ -761,7 +794,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14638                 rt_mutex_dequeue_pi(task, waiter);
14639                 waiter = rt_mutex_top_waiter(lock);
14640                 rt_mutex_enqueue_pi(task, waiter);
14641 -               __rt_mutex_adjust_prio(task);
14642 +               rt_mutex_adjust_prio(task);
14643         } else {
14644                 /*
14645                  * Nothing changed. No need to do any priority
14646 @@ -818,6 +851,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14647         return ret;
14648  }
14649  
14650 +
14651  /*
14652   * Try to take an rt-mutex
14653   *
14654 @@ -827,10 +861,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14655   * @task:   The task which wants to acquire the lock
14656   * @waiter: The waiter that is queued to the lock's wait tree if the
14657   *         callsite called task_blocked_on_lock(), otherwise NULL
14658 + * @mode:   Lock steal mode (STEAL_NORMAL, STEAL_LATERAL)
14659   */
14660 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14661 -                               struct rt_mutex_waiter *waiter)
14662 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
14663 +                                 struct task_struct *task,
14664 +                                 struct rt_mutex_waiter *waiter, int mode)
14665  {
14666 +       lockdep_assert_held(&lock->wait_lock);
14667 +
14668         /*
14669          * Before testing whether we can acquire @lock, we set the
14670          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
14671 @@ -863,12 +901,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14672          */
14673         if (waiter) {
14674                 /*
14675 -                * If waiter is not the highest priority waiter of
14676 -                * @lock, give up.
14677 +                * If waiter is not the highest priority waiter of @lock,
14678 +                * or its peer when lateral steal is allowed, give up.
14679                  */
14680 -               if (waiter != rt_mutex_top_waiter(lock))
14681 +               if (!rt_mutex_steal(lock, waiter, mode))
14682                         return 0;
14683 -
14684                 /*
14685                  * We can acquire the lock. Remove the waiter from the
14686                  * lock waiters tree.
14687 @@ -886,13 +923,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14688                  */
14689                 if (rt_mutex_has_waiters(lock)) {
14690                         /*
14691 -                        * If @task->prio is greater than or equal to
14692 -                        * the top waiter priority (kernel view),
14693 -                        * @task lost.
14694 +                        * If @task->prio is greater than the top waiter
14695 +                        * priority (kernel view), or equal to it when a
14696 +                        * lateral steal is forbidden, @task lost.
14697                          */
14698 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
14699 +                       if (!rt_mutex_steal(lock, task_to_waiter(task), mode))
14700                                 return 0;
14701 -
14702                         /*
14703                          * The current top waiter stays enqueued. We
14704                          * don't have to change anything in the lock
14705 @@ -936,177 +972,589 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14706          */
14707         rt_mutex_set_owner(lock, task);
14708  
14709 -       rt_mutex_deadlock_account_lock(lock, task);
14710 -
14711         return 1;
14712  }
14713  
14714 +#ifdef CONFIG_PREEMPT_RT_FULL
14715  /*
14716 - * Task blocks on lock.
14717 - *
14718 - * Prepare waiter and propagate pi chain
14719 - *
14720 - * This must be called with lock->wait_lock held and interrupts disabled
14721 + * preemptible spin_lock functions:
14722   */
14723 -static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14724 -                                  struct rt_mutex_waiter *waiter,
14725 -                                  struct task_struct *task,
14726 -                                  enum rtmutex_chainwalk chwalk)
14727 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
14728 +                                        void  (*slowfn)(struct rt_mutex *lock,
14729 +                                                        bool mg_off),
14730 +                                        bool do_mig_dis)
14731  {
14732 -       struct task_struct *owner = rt_mutex_owner(lock);
14733 -       struct rt_mutex_waiter *top_waiter = waiter;
14734 -       struct rt_mutex *next_lock;
14735 -       int chain_walk = 0, res;
14736 +       might_sleep_no_state_check();
14737  
14738 -       /*
14739 -        * Early deadlock detection. We really don't want the task to
14740 -        * enqueue on itself just to untangle the mess later. It's not
14741 -        * only an optimization. We drop the locks, so another waiter
14742 -        * can come in before the chain walk detects the deadlock. So
14743 -        * the other will detect the deadlock and return -EDEADLOCK,
14744 -        * which is wrong, as the other waiter is not in a deadlock
14745 -        * situation.
14746 -        */
14747 -       if (owner == task)
14748 -               return -EDEADLK;
14749 +       if (do_mig_dis)
14750 +               migrate_disable();
14751  
14752 -       raw_spin_lock(&task->pi_lock);
14753 -       __rt_mutex_adjust_prio(task);
14754 -       waiter->task = task;
14755 -       waiter->lock = lock;
14756 -       waiter->prio = task->prio;
14757 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14758 +               return;
14759 +       else
14760 +               slowfn(lock, do_mig_dis);
14761 +}
14762  
14763 -       /* Get the top priority waiter on the lock */
14764 -       if (rt_mutex_has_waiters(lock))
14765 -               top_waiter = rt_mutex_top_waiter(lock);
14766 -       rt_mutex_enqueue(lock, waiter);
14767 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
14768 +                                          void  (*slowfn)(struct rt_mutex *lock))
14769 +{
14770 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
14771 +               return;
14772 +       else
14773 +               slowfn(lock);
14774 +}
14775 +#ifdef CONFIG_SMP
14776 +/*
14777 + * Note that owner is a speculative pointer and dereferencing relies
14778 + * on rcu_read_lock() and the check against the lock owner.
14779 + */
14780 +static int adaptive_wait(struct rt_mutex *lock,
14781 +                        struct task_struct *owner)
14782 +{
14783 +       int res = 0;
14784  
14785 -       task->pi_blocked_on = waiter;
14786 +       rcu_read_lock();
14787 +       for (;;) {
14788 +               if (owner != rt_mutex_owner(lock))
14789 +                       break;
14790 +               /*
14791 +                * Ensure that owner->on_cpu is dereferenced _after_
14792 +                * checking the above to be valid.
14793 +                */
14794 +               barrier();
14795 +               if (!owner->on_cpu) {
14796 +                       res = 1;
14797 +                       break;
14798 +               }
14799 +               cpu_relax();
14800 +       }
14801 +       rcu_read_unlock();
14802 +       return res;
14803 +}
14804 +#else
14805 +static int adaptive_wait(struct rt_mutex *lock,
14806 +                        struct task_struct *orig_owner)
14807 +{
14808 +       return 1;
14809 +}
14810 +#endif
14811  
14812 -       raw_spin_unlock(&task->pi_lock);
14813 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14814 +                                  struct rt_mutex_waiter *waiter,
14815 +                                  struct task_struct *task,
14816 +                                  enum rtmutex_chainwalk chwalk);
14817 +/*
14818 + * Slow path lock function spin_lock style: this variant is very
14819 + * careful not to miss any non-lock wakeups.
14820 + *
14821 + * We store the current state under p->pi_lock in p->saved_state and
14822 + * the try_to_wake_up() code handles this accordingly.
14823 + */
14824 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
14825 +                                                   bool mg_off)
14826 +{
14827 +       struct task_struct *lock_owner, *self = current;
14828 +       struct rt_mutex_waiter waiter, *top_waiter;
14829 +       unsigned long flags;
14830 +       int ret;
14831  
14832 -       if (!owner)
14833 -               return 0;
14834 +       rt_mutex_init_waiter(&waiter, true);
14835  
14836 -       raw_spin_lock(&owner->pi_lock);
14837 -       if (waiter == rt_mutex_top_waiter(lock)) {
14838 -               rt_mutex_dequeue_pi(owner, top_waiter);
14839 -               rt_mutex_enqueue_pi(owner, waiter);
14840 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14841  
14842 -               __rt_mutex_adjust_prio(owner);
14843 -               if (owner->pi_blocked_on)
14844 -                       chain_walk = 1;
14845 -       } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
14846 -               chain_walk = 1;
14847 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
14848 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14849 +               return;
14850         }
14851  
14852 -       /* Store the lock on which owner is blocked or NULL */
14853 -       next_lock = task_blocked_on_lock(owner);
14854 +       BUG_ON(rt_mutex_owner(lock) == self);
14855  
14856 -       raw_spin_unlock(&owner->pi_lock);
14857         /*
14858 -        * Even if full deadlock detection is on, if the owner is not
14859 -        * blocked itself, we can avoid finding this out in the chain
14860 -        * walk.
14861 +        * We save whatever state the task is in and we'll restore it
14862 +        * after acquiring the lock taking real wakeups into account
14863 +        * as well. We are serialized via pi_lock against wakeups. See
14864 +        * try_to_wake_up().
14865          */
14866 -       if (!chain_walk || !next_lock)
14867 -               return 0;
14868 +       raw_spin_lock(&self->pi_lock);
14869 +       self->saved_state = self->state;
14870 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14871 +       raw_spin_unlock(&self->pi_lock);
14872  
14873 -       /*
14874 -        * The owner can't disappear while holding a lock,
14875 -        * so the owner struct is protected by wait_lock.
14876 -        * Gets dropped in rt_mutex_adjust_prio_chain()!
14877 -        */
14878 -       get_task_struct(owner);
14879 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
14880 +       BUG_ON(ret);
14881  
14882 -       raw_spin_unlock_irq(&lock->wait_lock);
14883 +       for (;;) {
14884 +               /* Try to acquire the lock again. */
14885 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
14886 +                       break;
14887  
14888 -       res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
14889 -                                        next_lock, waiter, task);
14890 +               top_waiter = rt_mutex_top_waiter(lock);
14891 +               lock_owner = rt_mutex_owner(lock);
14892  
14893 -       raw_spin_lock_irq(&lock->wait_lock);
14894 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14895  
14896 -       return res;
14897 -}
14898 +               debug_rt_mutex_print_deadlock(&waiter);
14899  
14900 -/*
14901 - * Remove the top waiter from the current tasks pi waiter tree and
14902 - * queue it up.
14903 - *
14904 - * Called with lock->wait_lock held and interrupts disabled.
14905 - */
14906 -static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14907 -                                   struct rt_mutex *lock)
14908 -{
14909 -       struct rt_mutex_waiter *waiter;
14910 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
14911 +                       if (mg_off)
14912 +                               migrate_enable();
14913 +                       schedule();
14914 +                       if (mg_off)
14915 +                               migrate_disable();
14916 +               }
14917  
14918 -       raw_spin_lock(&current->pi_lock);
14919 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
14920  
14921 -       waiter = rt_mutex_top_waiter(lock);
14922 +               raw_spin_lock(&self->pi_lock);
14923 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14924 +               raw_spin_unlock(&self->pi_lock);
14925 +       }
14926  
14927         /*
14928 -        * Remove it from current->pi_waiters. We do not adjust a
14929 -        * possible priority boost right now. We execute wakeup in the
14930 -        * boosted mode and go back to normal after releasing
14931 -        * lock->wait_lock.
14932 +        * Restore the task state to current->saved_state. We set it
14933 +        * to the original state above and the try_to_wake_up() code
14934 +        * has possibly updated it when a real (non-rtmutex) wakeup
14935 +        * happened while we were blocked. Clear saved_state so
14936 +        * try_to_wake_up() does not get confused.
14937          */
14938 -       rt_mutex_dequeue_pi(current, waiter);
14939 +       raw_spin_lock(&self->pi_lock);
14940 +       __set_current_state_no_track(self->saved_state);
14941 +       self->saved_state = TASK_RUNNING;
14942 +       raw_spin_unlock(&self->pi_lock);
14943  
14944         /*
14945 -        * As we are waking up the top waiter, and the waiter stays
14946 -        * queued on the lock until it gets the lock, this lock
14947 -        * obviously has waiters. Just set the bit here and this has
14948 -        * the added benefit of forcing all new tasks into the
14949 -        * slow path making sure no task of lower priority than
14950 -        * the top waiter can steal this lock.
14951 +        * try_to_take_rt_mutex() sets the waiter bit
14952 +        * unconditionally. We might have to fix that up:
14953          */
14954 -       lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
14955 +       fixup_rt_mutex_waiters(lock);
14956  
14957 -       raw_spin_unlock(&current->pi_lock);
14958 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
14959 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
14960  
14961 -       wake_q_add(wake_q, waiter->task);
14962 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14963 +
14964 +       debug_rt_mutex_free_waiter(&waiter);
14965  }
14966  
14967 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
14968 +                                            struct wake_q_head *wake_q,
14969 +                                            struct wake_q_head *wq_sleeper);
14970  /*
14971 - * Remove a waiter from a lock and give up
14972 - *
14973 - * Must be called with lock->wait_lock held and interrupts disabled. I must
14974 - * have just failed to try_to_take_rt_mutex().
14975 + * Slow path to release a rt_mutex spin_lock style
14976   */
14977 -static void remove_waiter(struct rt_mutex *lock,
14978 -                         struct rt_mutex_waiter *waiter)
14979 +static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
14980  {
14981 -       bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
14982 -       struct task_struct *owner = rt_mutex_owner(lock);
14983 -       struct rt_mutex *next_lock;
14984 +       unsigned long flags;
14985 +       WAKE_Q(wake_q);
14986 +       WAKE_Q(wake_sleeper_q);
14987 +       bool postunlock;
14988  
14989 -       raw_spin_lock(&current->pi_lock);
14990 -       rt_mutex_dequeue(lock, waiter);
14991 -       current->pi_blocked_on = NULL;
14992 -       raw_spin_unlock(&current->pi_lock);
14993 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14994 +       postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
14995 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14996  
14997 -       /*
14998 -        * Only update priority if the waiter was the highest priority
14999 -        * waiter of the lock and there is an owner to update.
15000 -        */
15001 -       if (!owner || !is_top_waiter)
15002 -               return;
15003 +       if (postunlock)
15004 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15005 +}
15006  
15007 -       raw_spin_lock(&owner->pi_lock);
15008 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
15009 +{
15010 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
15011 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
15012 +}
15013 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
15014  
15015 -       rt_mutex_dequeue_pi(owner, waiter);
15016 +void __lockfunc rt_spin_lock(spinlock_t *lock)
15017 +{
15018 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
15019 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
15020 +}
15021 +EXPORT_SYMBOL(rt_spin_lock);
15022  
15023 -       if (rt_mutex_has_waiters(lock))
15024 -               rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
15025 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
15026 +{
15027 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
15028 +}
15029 +EXPORT_SYMBOL(__rt_spin_lock);
15030  
15031 -       __rt_mutex_adjust_prio(owner);
15032 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
15033 +{
15034 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
15035 +}
15036 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
15037  
15038 -       /* Store the lock on which owner is blocked or NULL */
15039 -       next_lock = task_blocked_on_lock(owner);
15040 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15041 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
15042 +{
15043 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
15044 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
15045 +}
15046 +EXPORT_SYMBOL(rt_spin_lock_nested);
15047 +#endif
15048  
15049 -       raw_spin_unlock(&owner->pi_lock);
15050 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
15051 +{
15052 +       /* NOTE: we always pass in '1' for nested, for simplicity */
15053 +       spin_release(&lock->dep_map, 1, _RET_IP_);
15054 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
15055 +}
15056 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
15057  
15058 -       /*
15059 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
15060 +{
15061 +       /* NOTE: we always pass in '1' for nested, for simplicity */
15062 +       spin_release(&lock->dep_map, 1, _RET_IP_);
15063 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
15064 +       migrate_enable();
15065 +}
15066 +EXPORT_SYMBOL(rt_spin_unlock);
15067 +
15068 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
15069 +{
15070 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
15071 +}
15072 +EXPORT_SYMBOL(__rt_spin_unlock);
15073 +
15074 +/*
15075 + * Wait for the lock to get unlocked: instead of polling for an unlock
15076 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
15077 + * schedule if there's contention:
15078 + */
15079 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
15080 +{
15081 +       spin_lock(lock);
15082 +       spin_unlock(lock);
15083 +}
15084 +EXPORT_SYMBOL(rt_spin_unlock_wait);
15085 +
15086 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
15087 +{
15088 +       int ret;
15089 +
15090 +       ret = rt_mutex_trylock(&lock->lock);
15091 +       if (ret)
15092 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15093 +       return ret;
15094 +}
15095 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
15096 +
15097 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
15098 +{
15099 +       int ret;
15100 +
15101 +       migrate_disable();
15102 +       ret = rt_mutex_trylock(&lock->lock);
15103 +       if (ret)
15104 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15105 +       else
15106 +               migrate_enable();
15107 +       return ret;
15108 +}
15109 +EXPORT_SYMBOL(rt_spin_trylock);
15110 +
15111 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
15112 +{
15113 +       int ret;
15114 +
15115 +       local_bh_disable();
15116 +       ret = rt_mutex_trylock(&lock->lock);
15117 +       if (ret) {
15118 +               migrate_disable();
15119 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15120 +       } else
15121 +               local_bh_enable();
15122 +       return ret;
15123 +}
15124 +EXPORT_SYMBOL(rt_spin_trylock_bh);
15125 +
15126 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
15127 +{
15128 +       int ret;
15129 +
15130 +       *flags = 0;
15131 +       ret = rt_mutex_trylock(&lock->lock);
15132 +       if (ret) {
15133 +               migrate_disable();
15134 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
15135 +       }
15136 +       return ret;
15137 +}
15138 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
15139 +
15140 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
15141 +{
15142 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
15143 +       if (atomic_add_unless(atomic, -1, 1))
15144 +               return 0;
15145 +       rt_spin_lock(lock);
15146 +       if (atomic_dec_and_test(atomic))
15147 +               return 1;
15148 +       rt_spin_unlock(lock);
15149 +       return 0;
15150 +}
15151 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
15152 +
15153 +void
15154 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
15155 +{
15156 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15157 +       /*
15158 +        * Make sure we are not reinitializing a held lock:
15159 +        */
15160 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
15161 +       lockdep_init_map(&lock->dep_map, name, key, 0);
15162 +#endif
15163 +}
15164 +EXPORT_SYMBOL(__rt_spin_lock_init);
15165 +
15166 +#endif /* PREEMPT_RT_FULL */
15167 +
15168 +#ifdef CONFIG_PREEMPT_RT_FULL
15169 +static inline int __sched
15170 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
15171 +{
15172 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
15173 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
15174 +
15175 +       if (!hold_ctx)
15176 +               return 0;
15177 +
15178 +       if (unlikely(ctx == hold_ctx))
15179 +               return -EALREADY;
15180 +
15181 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
15182 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
15183 +#ifdef CONFIG_DEBUG_MUTEXES
15184 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
15185 +               ctx->contending_lock = ww;
15186 +#endif
15187 +               return -EDEADLK;
15188 +       }
15189 +
15190 +       return 0;
15191 +}
15192 +#else
15193 +static inline int __sched
15194 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
15195 +{
15196 +       BUG();
15197 +       return 0;
15198 +}
15199 +
15200 +#endif
15201 +
15202 +static inline int
15203 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
15204 +                    struct rt_mutex_waiter *waiter)
15205 +{
15206 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
15207 +}
15208 +
15209 +/*
15210 + * Task blocks on lock.
15211 + *
15212 + * Prepare waiter and propagate pi chain
15213 + *
15214 + * This must be called with lock->wait_lock held and interrupts disabled
15215 + */
15216 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
15217 +                                  struct rt_mutex_waiter *waiter,
15218 +                                  struct task_struct *task,
15219 +                                  enum rtmutex_chainwalk chwalk)
15220 +{
15221 +       struct task_struct *owner = rt_mutex_owner(lock);
15222 +       struct rt_mutex_waiter *top_waiter = waiter;
15223 +       struct rt_mutex *next_lock;
15224 +       int chain_walk = 0, res;
15225 +
15226 +       lockdep_assert_held(&lock->wait_lock);
15227 +
15228 +       /*
15229 +        * Early deadlock detection. We really don't want the task to
15230 +        * enqueue on itself just to untangle the mess later. It's not
15231 +        * only an optimization. We drop the locks, so another waiter
15232 +        * can come in before the chain walk detects the deadlock. So
15233 +        * the other will detect the deadlock and return -EDEADLOCK,
15234 +        * which is wrong, as the other waiter is not in a deadlock
15235 +        * situation.
15236 +        */
15237 +       if (owner == task)
15238 +               return -EDEADLK;
15239 +
15240 +       raw_spin_lock(&task->pi_lock);
15241 +
15242 +       /*
15243 +        * In the case of futex requeue PI, this will be a proxy
15244 +        * lock. The task will wake unaware that it is enqueued on
15245 +        * this lock. Avoid blocking on two locks and corrupting
15246 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
15247 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
15248 +        * before requeue (due to a signal or timeout). Do not enqueue
15249 +        * the task if PI_WAKEUP_INPROGRESS is set.
15250 +        */
15251 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
15252 +               raw_spin_unlock(&task->pi_lock);
15253 +               return -EAGAIN;
15254 +       }
15255 +
15256 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
15257 +
15258 +       rt_mutex_adjust_prio(task);
15259 +       waiter->task = task;
15260 +       waiter->lock = lock;
15261 +       waiter->prio = task->prio;
15262 +       waiter->deadline = task->dl.deadline;
15263 +
15264 +       /* Get the top priority waiter on the lock */
15265 +       if (rt_mutex_has_waiters(lock))
15266 +               top_waiter = rt_mutex_top_waiter(lock);
15267 +       rt_mutex_enqueue(lock, waiter);
15268 +
15269 +       task->pi_blocked_on = waiter;
15270 +
15271 +       raw_spin_unlock(&task->pi_lock);
15272 +
15273 +       if (!owner)
15274 +               return 0;
15275 +
15276 +       raw_spin_lock(&owner->pi_lock);
15277 +       if (waiter == rt_mutex_top_waiter(lock)) {
15278 +               rt_mutex_dequeue_pi(owner, top_waiter);
15279 +               rt_mutex_enqueue_pi(owner, waiter);
15280 +
15281 +               rt_mutex_adjust_prio(owner);
15282 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
15283 +                       chain_walk = 1;
15284 +       } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
15285 +               chain_walk = 1;
15286 +       }
15287 +
15288 +       /* Store the lock on which owner is blocked or NULL */
15289 +       next_lock = task_blocked_on_lock(owner);
15290 +
15291 +       raw_spin_unlock(&owner->pi_lock);
15292 +       /*
15293 +        * Even if full deadlock detection is on, if the owner is not
15294 +        * blocked itself, we can avoid finding this out in the chain
15295 +        * walk.
15296 +        */
15297 +       if (!chain_walk || !next_lock)
15298 +               return 0;
15299 +
15300 +       /*
15301 +        * The owner can't disappear while holding a lock,
15302 +        * so the owner struct is protected by wait_lock.
15303 +        * Gets dropped in rt_mutex_adjust_prio_chain()!
15304 +        */
15305 +       get_task_struct(owner);
15306 +
15307 +       raw_spin_unlock_irq(&lock->wait_lock);
15308 +
15309 +       res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
15310 +                                        next_lock, waiter, task);
15311 +
15312 +       raw_spin_lock_irq(&lock->wait_lock);
15313 +
15314 +       return res;
15315 +}
15316 +
15317 +/*
15318 + * Remove the top waiter from the current task's pi waiter tree and
15319 + * queue it up.
15320 + *
15321 + * Called with lock->wait_lock held and interrupts disabled.
15322 + */
15323 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
15324 +                                   struct wake_q_head *wake_sleeper_q,
15325 +                                   struct rt_mutex *lock)
15326 +{
15327 +       struct rt_mutex_waiter *waiter;
15328 +
15329 +       raw_spin_lock(&current->pi_lock);
15330 +
15331 +       waiter = rt_mutex_top_waiter(lock);
15332 +
15333 +       /*
15334 +        * Remove it from current->pi_waiters and deboost.
15335 +        *
15336 +        * We must in fact deboost here in order to ensure we call
15337 +        * rt_mutex_setprio() to update p->pi_top_task before the
15338 +        * task unblocks.
15339 +        */
15340 +       rt_mutex_dequeue_pi(current, waiter);
15341 +       rt_mutex_adjust_prio(current);
15342 +
15343 +       /*
15344 +        * As we are waking up the top waiter, and the waiter stays
15345 +        * queued on the lock until it gets the lock, this lock
15346 +        * obviously has waiters. Just set the bit here and this has
15347 +        * the added benefit of forcing all new tasks into the
15348 +        * slow path making sure no task of lower priority than
15349 +        * the top waiter can steal this lock.
15350 +        */
15351 +       lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
15352 +
15353 +       /*
15354 +        * We deboosted before waking the top waiter task such that we don't
15355 +        * run two tasks with the 'same' priority (and ensure the
15356 +        * p->pi_top_task pointer points to a blocked task). This however can
15357 +        * lead to priority inversion if we would get preempted after the
15358 +        * deboost but before waking our donor task, hence the preempt_disable()
15359 +        * before unlock.
15360 +        *
15361 +        * Pairs with preempt_enable() in rt_mutex_postunlock();
15362 +        */
15363 +       preempt_disable();
15364 +       if (waiter->savestate)
15365 +               wake_q_add_sleeper(wake_sleeper_q, waiter->task);
15366 +       else
15367 +               wake_q_add(wake_q, waiter->task);
15368 +       raw_spin_unlock(&current->pi_lock);
15369 +}
15370 +
15371 +/*
15372 + * Remove a waiter from a lock and give up
15373 + *
15374 + * Must be called with lock->wait_lock held and interrupts disabled. The caller
15375 + * must have just failed to try_to_take_rt_mutex().
15376 + */
15377 +static void remove_waiter(struct rt_mutex *lock,
15378 +                         struct rt_mutex_waiter *waiter)
15379 +{
15380 +       bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
15381 +       struct task_struct *owner = rt_mutex_owner(lock);
15382 +       struct rt_mutex *next_lock = NULL;
15383 +
15384 +       lockdep_assert_held(&lock->wait_lock);
15385 +
15386 +       raw_spin_lock(&current->pi_lock);
15387 +       rt_mutex_dequeue(lock, waiter);
15388 +       current->pi_blocked_on = NULL;
15389 +       raw_spin_unlock(&current->pi_lock);
15390 +
15391 +       /*
15392 +        * Only update priority if the waiter was the highest priority
15393 +        * waiter of the lock and there is an owner to update.
15394 +        */
15395 +       if (!owner || !is_top_waiter)
15396 +               return;
15397 +
15398 +       raw_spin_lock(&owner->pi_lock);
15399 +
15400 +       rt_mutex_dequeue_pi(owner, waiter);
15401 +
15402 +       if (rt_mutex_has_waiters(lock))
15403 +               rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
15404 +
15405 +       rt_mutex_adjust_prio(owner);
15406 +
15407 +       /* Store the lock on which owner is blocked or NULL */
15408 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
15409 +               next_lock = task_blocked_on_lock(owner);
15410 +
15411 +       raw_spin_unlock(&owner->pi_lock);
15412 +
15413 +       /*
15414          * Don't walk the chain, if the owner task is not blocked
15415          * itself.
15416          */
15417 @@ -1138,21 +1586,30 @@ void rt_mutex_adjust_pi(struct task_struct *task)
15418         raw_spin_lock_irqsave(&task->pi_lock, flags);
15419  
15420         waiter = task->pi_blocked_on;
15421 -       if (!waiter || (waiter->prio == task->prio &&
15422 -                       !dl_prio(task->prio))) {
15423 +       if (!rt_mutex_real_waiter(waiter) ||
15424 +           rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
15425                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15426                 return;
15427         }
15428         next_lock = waiter->lock;
15429 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15430  
15431         /* gets dropped in rt_mutex_adjust_prio_chain()! */
15432         get_task_struct(task);
15433  
15434 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15435         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
15436                                    next_lock, NULL, task);
15437  }
15438  
15439 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
15440 +{
15441 +       debug_rt_mutex_init_waiter(waiter);
15442 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
15443 +       RB_CLEAR_NODE(&waiter->tree_entry);
15444 +       waiter->task = NULL;
15445 +       waiter->savestate = savestate;
15446 +}
15447 +
15448  /**
15449   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
15450   * @lock:               the rt_mutex to take
15451 @@ -1166,7 +1623,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
15452  static int __sched
15453  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
15454                     struct hrtimer_sleeper *timeout,
15455 -                   struct rt_mutex_waiter *waiter)
15456 +                   struct rt_mutex_waiter *waiter,
15457 +                   struct ww_acquire_ctx *ww_ctx)
15458  {
15459         int ret = 0;
15460  
15461 @@ -1175,16 +1633,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
15462                 if (try_to_take_rt_mutex(lock, current, waiter))
15463                         break;
15464  
15465 -               /*
15466 -                * TASK_INTERRUPTIBLE checks for signals and
15467 -                * timeout. Ignored otherwise.
15468 -                */
15469 -               if (unlikely(state == TASK_INTERRUPTIBLE)) {
15470 -                       /* Signal pending? */
15471 -                       if (signal_pending(current))
15472 -                               ret = -EINTR;
15473 -                       if (timeout && !timeout->task)
15474 -                               ret = -ETIMEDOUT;
15475 +               if (timeout && !timeout->task) {
15476 +                       ret = -ETIMEDOUT;
15477 +                       break;
15478 +               }
15479 +               if (signal_pending_state(state, current)) {
15480 +                       ret = -EINTR;
15481 +                       break;
15482 +               }
15483 +
15484 +               if (ww_ctx && ww_ctx->acquired > 0) {
15485 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
15486                         if (ret)
15487                                 break;
15488                 }
15489 @@ -1223,35 +1682,94 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
15490         }
15491  }
15492  
15493 -/*
15494 - * Slow path lock function:
15495 - */
15496 -static int __sched
15497 -rt_mutex_slowlock(struct rt_mutex *lock, int state,
15498 -                 struct hrtimer_sleeper *timeout,
15499 -                 enum rtmutex_chainwalk chwalk)
15500 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
15501 +                                                  struct ww_acquire_ctx *ww_ctx)
15502 +{
15503 +#ifdef CONFIG_DEBUG_MUTEXES
15504 +       /*
15505 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
15506 +        * but released with a normal mutex_unlock in this call.
15507 +        *
15508 +        * This should never happen, always use ww_mutex_unlock.
15509 +        */
15510 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
15511 +
15512 +       /*
15513 +        * Not quite done after calling ww_acquire_done() ?
15514 +        */
15515 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
15516 +
15517 +       if (ww_ctx->contending_lock) {
15518 +               /*
15519 +                * After -EDEADLK you tried to
15520 +                * acquire a different ww_mutex? Bad!
15521 +                */
15522 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
15523 +
15524 +               /*
15525 +                * You called ww_mutex_lock after receiving -EDEADLK,
15526 +                * but 'forgot' to unlock everything else first?
15527 +                */
15528 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
15529 +               ww_ctx->contending_lock = NULL;
15530 +       }
15531 +
15532 +       /*
15533 +        * Naughty, using a different class will lead to undefined behavior!
15534 +        */
15535 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
15536 +#endif
15537 +       ww_ctx->acquired++;
15538 +}
15539 +
15540 +#ifdef CONFIG_PREEMPT_RT_FULL
15541 +static void ww_mutex_account_lock(struct rt_mutex *lock,
15542 +                                 struct ww_acquire_ctx *ww_ctx)
15543  {
15544 -       struct rt_mutex_waiter waiter;
15545 -       unsigned long flags;
15546 -       int ret = 0;
15547 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
15548 +       struct rt_mutex_waiter *waiter, *n;
15549  
15550 -       debug_rt_mutex_init_waiter(&waiter);
15551 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
15552 -       RB_CLEAR_NODE(&waiter.tree_entry);
15553 +       /*
15554 +        * This branch gets optimized out for the common case,
15555 +        * and is only important for ww_mutex_lock.
15556 +        */
15557 +       ww_mutex_lock_acquired(ww, ww_ctx);
15558 +       ww->ctx = ww_ctx;
15559  
15560         /*
15561 -        * Technically we could use raw_spin_[un]lock_irq() here, but this can
15562 -        * be called in early boot if the cmpxchg() fast path is disabled
15563 -        * (debug, no architecture support). In this case we will acquire the
15564 -        * rtmutex with lock->wait_lock held. But we cannot unconditionally
15565 -        * enable interrupts in that early boot case. So we need to use the
15566 -        * irqsave/restore variants.
15567 +        * Give any possible sleeping processes the chance to wake up,
15568 +        * so they can recheck if they have to back off.
15569          */
15570 -       raw_spin_lock_irqsave(&lock->wait_lock, flags);
15571 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
15572 +                                            tree_entry) {
15573 +               /* XXX debug rt mutex waiter wakeup */
15574 +
15575 +               BUG_ON(waiter->lock != lock);
15576 +               rt_mutex_wake_waiter(waiter);
15577 +       }
15578 +}
15579 +
15580 +#else
15581 +
15582 +static void ww_mutex_account_lock(struct rt_mutex *lock,
15583 +                                 struct ww_acquire_ctx *ww_ctx)
15584 +{
15585 +       BUG();
15586 +}
15587 +#endif
15588 +
15589 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
15590 +                                    struct hrtimer_sleeper *timeout,
15591 +                                    enum rtmutex_chainwalk chwalk,
15592 +                                    struct ww_acquire_ctx *ww_ctx,
15593 +                                    struct rt_mutex_waiter *waiter)
15594 +{
15595 +       int ret;
15596  
15597         /* Try to acquire the lock again: */
15598         if (try_to_take_rt_mutex(lock, current, NULL)) {
15599 -               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15600 +               if (ww_ctx)
15601 +                       ww_mutex_account_lock(lock, ww_ctx);
15602                 return 0;
15603         }
15604  
15605 @@ -1261,17 +1779,27 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
15606         if (unlikely(timeout))
15607                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
15608  
15609 -       ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
15610 +       ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
15611  
15612 -       if (likely(!ret))
15613 +       if (likely(!ret)) {
15614                 /* sleep on the mutex */
15615 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
15616 +               ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
15617 +                                         ww_ctx);
15618 +       } else if (ww_ctx) {
15619 +               /* ww_mutex received EDEADLK, let it become EALREADY */
15620 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
15621 +               BUG_ON(!ret);
15622 +       }
15623  
15624         if (unlikely(ret)) {
15625                 __set_current_state(TASK_RUNNING);
15626                 if (rt_mutex_has_waiters(lock))
15627 -                       remove_waiter(lock, &waiter);
15628 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
15629 +                       remove_waiter(lock, waiter);
15630 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
15631 +               if (!ww_ctx)
15632 +                       rt_mutex_handle_deadlock(ret, chwalk, waiter);
15633 +       } else if (ww_ctx) {
15634 +               ww_mutex_account_lock(lock, ww_ctx);
15635         }
15636  
15637         /*
15638 @@ -1279,6 +1807,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
15639          * unconditionally. We might have to fix that up.
15640          */
15641         fixup_rt_mutex_waiters(lock);
15642 +       return ret;
15643 +}
15644 +
15645 +/*
15646 + * Slow path lock function:
15647 + */
15648 +static int __sched
15649 +rt_mutex_slowlock(struct rt_mutex *lock, int state,
15650 +                 struct hrtimer_sleeper *timeout,
15651 +                 enum rtmutex_chainwalk chwalk,
15652 +                 struct ww_acquire_ctx *ww_ctx)
15653 +{
15654 +       struct rt_mutex_waiter waiter;
15655 +       unsigned long flags;
15656 +       int ret = 0;
15657 +
15658 +       rt_mutex_init_waiter(&waiter, false);
15659 +
15660 +       /*
15661 +        * Technically we could use raw_spin_[un]lock_irq() here, but this can
15662 +        * be called in early boot if the cmpxchg() fast path is disabled
15663 +        * (debug, no architecture support). In this case we will acquire the
15664 +        * rtmutex with lock->wait_lock held. But we cannot unconditionally
15665 +        * enable interrupts in that early boot case. So we need to use the
15666 +        * irqsave/restore variants.
15667 +        */
15668 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
15669 +
15670 +       ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
15671 +                                      &waiter);
15672  
15673         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15674  
15675 @@ -1328,10 +1886,12 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
15676  
15677  /*
15678   * Slow path to release a rt-mutex.
15679 - * Return whether the current task needs to undo a potential priority boosting.
15680 + *
15681 + * Return whether the current task needs to call rt_mutex_postunlock().
15682   */
15683  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15684 -                                       struct wake_q_head *wake_q)
15685 +                                       struct wake_q_head *wake_q,
15686 +                                       struct wake_q_head *wake_sleeper_q)
15687  {
15688         unsigned long flags;
15689  
15690 @@ -1340,8 +1900,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15691  
15692         debug_rt_mutex_unlock(lock);
15693  
15694 -       rt_mutex_deadlock_account_unlock(current);
15695 -
15696         /*
15697          * We must be careful here if the fast path is enabled. If we
15698          * have no waiters queued we cannot set owner to NULL here
15699 @@ -1387,12 +1945,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15700          *
15701          * Queue the next waiter for wakeup once we release the wait_lock.
15702          */
15703 -       mark_wakeup_next_waiter(wake_q, lock);
15704 -
15705 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
15706         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15707  
15708 -       /* check PI boosting */
15709 -       return true;
15710 +       return true; /* call rt_mutex_postunlock() */
15711  }
15712  
15713  /*
15714 @@ -1403,63 +1959,97 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15715   */
15716  static inline int
15717  rt_mutex_fastlock(struct rt_mutex *lock, int state,
15718 +                 struct ww_acquire_ctx *ww_ctx,
15719                   int (*slowfn)(struct rt_mutex *lock, int state,
15720                                 struct hrtimer_sleeper *timeout,
15721 -                               enum rtmutex_chainwalk chwalk))
15722 +                               enum rtmutex_chainwalk chwalk,
15723 +                               struct ww_acquire_ctx *ww_ctx))
15724  {
15725 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15726 -               rt_mutex_deadlock_account_lock(lock, current);
15727 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15728                 return 0;
15729 -       } else
15730 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
15731 +
15732 +       /*
15733 +        * If rt_mutex blocks, the function sched_submit_work will not call
15734 +        * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true).
15735 +        * We must call blk_schedule_flush_plug() here; if we don't call it,
15736 +        * a deadlock in the device mapper may happen.
15737 +        */
15738 +       if (unlikely(blk_needs_flush_plug(current)))
15739 +               blk_schedule_flush_plug(current);
15740 +
15741 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
15742  }
15743  
15744  static inline int
15745  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
15746                         struct hrtimer_sleeper *timeout,
15747                         enum rtmutex_chainwalk chwalk,
15748 +                       struct ww_acquire_ctx *ww_ctx,
15749                         int (*slowfn)(struct rt_mutex *lock, int state,
15750                                       struct hrtimer_sleeper *timeout,
15751 -                                     enum rtmutex_chainwalk chwalk))
15752 +                                     enum rtmutex_chainwalk chwalk,
15753 +                                     struct ww_acquire_ctx *ww_ctx))
15754  {
15755         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
15756 -           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15757 -               rt_mutex_deadlock_account_lock(lock, current);
15758 +           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15759                 return 0;
15760 -       } else
15761 -               return slowfn(lock, state, timeout, chwalk);
15762 +
15763 +       if (unlikely(blk_needs_flush_plug(current)))
15764 +               blk_schedule_flush_plug(current);
15765 +
15766 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
15767  }
15768  
15769  static inline int
15770  rt_mutex_fasttrylock(struct rt_mutex *lock,
15771                      int (*slowfn)(struct rt_mutex *lock))
15772  {
15773 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15774 -               rt_mutex_deadlock_account_lock(lock, current);
15775 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15776                 return 1;
15777 -       }
15778 +
15779         return slowfn(lock);
15780  }
15781  
15782 +/*
15783 + * Performs the wakeup of the top-waiter and re-enables preemption.
15784 + */
15785 +void rt_mutex_postunlock(struct wake_q_head *wake_q,
15786 +                        struct wake_q_head *wq_sleeper)
15787 +{
15788 +       wake_up_q(wake_q);
15789 +       wake_up_q_sleeper(wq_sleeper);
15790 +
15791 +       /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
15792 +       preempt_enable();
15793 +}
15794 +
15795  static inline void
15796  rt_mutex_fastunlock(struct rt_mutex *lock,
15797                     bool (*slowfn)(struct rt_mutex *lock,
15798 -                                  struct wake_q_head *wqh))
15799 +                                  struct wake_q_head *wqh,
15800 +                                  struct wake_q_head *wq_sleeper))
15801  {
15802         WAKE_Q(wake_q);
15803 +       WAKE_Q(wake_sleeper_q);
15804  
15805 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15806 -               rt_mutex_deadlock_account_unlock(current);
15807 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
15808 +               return;
15809  
15810 -       } else {
15811 -               bool deboost = slowfn(lock, &wake_q);
15812 +       if (slowfn(lock, &wake_q, &wake_sleeper_q))
15813 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15814 +}
15815  
15816 -               wake_up_q(&wake_q);
15817 +/**
15818 + * rt_mutex_lock_state - lock a rt_mutex with a given state
15819 + *
15820 + * @lock:      The rt_mutex to be locked
15821 + * @state:     The state to set when blocking on the rt_mutex
15822 + */
15823 +int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
15824 +{
15825 +       might_sleep();
15826  
15827 -               /* Undo pi boosting if necessary: */
15828 -               if (deboost)
15829 -                       rt_mutex_adjust_prio(current);
15830 -       }
15831 +       return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
15832  }
15833  
15834  /**
15835 @@ -1469,15 +2059,13 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
15836   */
15837  void __sched rt_mutex_lock(struct rt_mutex *lock)
15838  {
15839 -       might_sleep();
15840 -
15841 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
15842 +       rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
15843  }
15844  EXPORT_SYMBOL_GPL(rt_mutex_lock);
15845  
15846  /**
15847   * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
15848 - *
15849 + **
15850   * @lock:              the rt_mutex to be locked
15851   *
15852   * Returns:
15853 @@ -1486,23 +2074,32 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
15854   */
15855  int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
15856  {
15857 -       might_sleep();
15858 -
15859 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
15860 +       return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
15861  }
15862  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
15863  
15864 -/*
15865 - * Futex variant with full deadlock detection.
15866 +/**
15867 + * rt_mutex_lock_killable - lock a rt_mutex killable
15868 + *
15869 + * @lock:              the rt_mutex to be locked
15870 + * @detect_deadlock:   deadlock detection on/off
15871 + *
15872 + * Returns:
15873 + *  0          on success
15874 + * -EINTR      when interrupted by a signal
15875   */
15876 -int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
15877 -                             struct hrtimer_sleeper *timeout)
15878 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
15879  {
15880 -       might_sleep();
15881 +       return rt_mutex_lock_state(lock, TASK_KILLABLE);
15882 +}
15883 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
15884  
15885 -       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15886 -                                      RT_MUTEX_FULL_CHAINWALK,
15887 -                                      rt_mutex_slowlock);
15888 +/*
15889 + * Futex variant, must not use fastpath.
15890 + */
15891 +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
15892 +{
15893 +       return rt_mutex_slowtrylock(lock);
15894  }
15895  
15896  /**
15897 @@ -1525,6 +2122,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
15898  
15899         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15900                                        RT_MUTEX_MIN_CHAINWALK,
15901 +                                      NULL,
15902                                        rt_mutex_slowlock);
15903  }
15904  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15905 @@ -1542,7 +2140,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15906   */
15907  int __sched rt_mutex_trylock(struct rt_mutex *lock)
15908  {
15909 +#ifdef CONFIG_PREEMPT_RT_FULL
15910 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
15911 +#else
15912         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
15913 +#endif
15914                 return 0;
15915  
15916         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
15917 @@ -1560,21 +2162,53 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
15918  }
15919  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
15920  
15921 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
15922 +                                            struct wake_q_head *wake_q,
15923 +                                            struct wake_q_head *wq_sleeper)
15924 +{
15925 +       lockdep_assert_held(&lock->wait_lock);
15926 +
15927 +       debug_rt_mutex_unlock(lock);
15928 +
15929 +       if (!rt_mutex_has_waiters(lock)) {
15930 +               lock->owner = NULL;
15931 +               return false; /* done */
15932 +       }
15933 +
15934 +       /*
15935 +        * We've already deboosted, mark_wakeup_next_waiter() will
15936 +        * retain preempt_disabled when we drop the wait_lock, to
15937 +        * avoid inversion prior to the wakeup.  preempt_disable()
15938 +        * therein pairs with rt_mutex_postunlock().
15939 +        */
15940 +       mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
15941 +
15942 +       return true; /* call postunlock() */
15943 +}
15944 +
15945  /**
15946 - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
15947 - * @lock: the rt_mutex to be unlocked
15948 - *
15949 - * Returns: true/false indicating whether priority adjustment is
15950 - * required or not.
15951 + * Futex variant: since futex variants do not use the fast path, this can be
15952 + * simple and does not need to retry.
15953   */
15954 -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
15955 -                                  struct wake_q_head *wqh)
15956 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
15957 +                                   struct wake_q_head *wake_q,
15958 +                                   struct wake_q_head *wq_sleeper)
15959  {
15960 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15961 -               rt_mutex_deadlock_account_unlock(current);
15962 -               return false;
15963 -       }
15964 -       return rt_mutex_slowunlock(lock, wqh);
15965 +       return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
15966 +}
15967 +
15968 +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
15969 +{
15970 +       WAKE_Q(wake_q);
15971 +       WAKE_Q(wake_sleeper_q);
15972 +       bool postunlock;
15973 +
15974 +       raw_spin_lock_irq(&lock->wait_lock);
15975 +       postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
15976 +       raw_spin_unlock_irq(&lock->wait_lock);
15977 +
15978 +       if (postunlock)
15979 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15980  }
15981  
15982  /**
15983 @@ -1607,13 +2241,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
15984  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
15985  {
15986         lock->owner = NULL;
15987 -       raw_spin_lock_init(&lock->wait_lock);
15988         lock->waiters = RB_ROOT;
15989         lock->waiters_leftmost = NULL;
15990  
15991         debug_rt_mutex_init(lock, name);
15992  }
15993 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
15994 +EXPORT_SYMBOL(__rt_mutex_init);
15995  
15996  /**
15997   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
15998 @@ -1628,10 +2261,9 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
15999  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
16000                                 struct task_struct *proxy_owner)
16001  {
16002 -       __rt_mutex_init(lock, NULL);
16003 +       rt_mutex_init(lock);
16004         debug_rt_mutex_proxy_lock(lock, proxy_owner);
16005         rt_mutex_set_owner(lock, proxy_owner);
16006 -       rt_mutex_deadlock_account_lock(lock, proxy_owner);
16007  }
16008  
16009  /**
16010 @@ -1647,34 +2279,44 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
16011  {
16012         debug_rt_mutex_proxy_unlock(lock);
16013         rt_mutex_set_owner(lock, NULL);
16014 -       rt_mutex_deadlock_account_unlock(proxy_owner);
16015  }
16016  
16017 -/**
16018 - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
16019 - * @lock:              the rt_mutex to take
16020 - * @waiter:            the pre-initialized rt_mutex_waiter
16021 - * @task:              the task to prepare
16022 - *
16023 - * Returns:
16024 - *  0 - task blocked on lock
16025 - *  1 - acquired the lock for task, caller should wake it up
16026 - * <0 - error
16027 - *
16028 - * Special API call for FUTEX_REQUEUE_PI support.
16029 - */
16030 -int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16031 +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16032                               struct rt_mutex_waiter *waiter,
16033                               struct task_struct *task)
16034  {
16035         int ret;
16036  
16037 -       raw_spin_lock_irq(&lock->wait_lock);
16038 -
16039 -       if (try_to_take_rt_mutex(lock, task, NULL)) {
16040 -               raw_spin_unlock_irq(&lock->wait_lock);
16041 +       if (try_to_take_rt_mutex(lock, task, NULL))
16042                 return 1;
16043 +
16044 +#ifdef CONFIG_PREEMPT_RT_FULL
16045 +       /*
16046 +        * In PREEMPT_RT there's an added race.
16047 +        * If the task, that we are about to requeue, times out,
16048 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
16049 +        * to skip this task. But right after the task sets
16050 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
16051 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
16052 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
16053 +        * lock that it blocks on. We *must not* place this task
16054 +        * on this proxy lock in that case.
16055 +        *
16056 +        * To prevent this race, we first take the task's pi_lock
16057 +        * and check if it has updated its pi_blocked_on. If it has,
16058 +        * we assume that it woke up and we return -EAGAIN.
16059 +        * Otherwise, we set the task's pi_blocked_on to
16060 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
16061 +        * it will know that we are in the process of requeuing it.
16062 +        */
16063 +       raw_spin_lock(&task->pi_lock);
16064 +       if (task->pi_blocked_on) {
16065 +               raw_spin_unlock(&task->pi_lock);
16066 +               return -EAGAIN;
16067         }
16068 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
16069 +       raw_spin_unlock(&task->pi_lock);
16070 +#endif
16071  
16072         /* We enforce deadlock detection for futexes */
16073         ret = task_blocks_on_rt_mutex(lock, waiter, task,
16074 @@ -1690,17 +2332,41 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16075                 ret = 0;
16076         }
16077  
16078 -       if (unlikely(ret))
16079 +       if (ret && rt_mutex_has_waiters(lock))
16080                 remove_waiter(lock, waiter);
16081  
16082 -       raw_spin_unlock_irq(&lock->wait_lock);
16083 -
16084         debug_rt_mutex_print_deadlock(waiter);
16085  
16086         return ret;
16087  }
16088  
16089  /**
16090 + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
16091 + * @lock:              the rt_mutex to take
16092 + * @waiter:            the pre-initialized rt_mutex_waiter
16093 + * @task:              the task to prepare
16094 + *
16095 + * Returns:
16096 + *  0 - task blocked on lock
16097 + *  1 - acquired the lock for task, caller should wake it up
16098 + * <0 - error
16099 + *
16100 + * Special API call for FUTEX_REQUEUE_PI support.
16101 + */
16102 +int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16103 +                             struct rt_mutex_waiter *waiter,
16104 +                             struct task_struct *task)
16105 +{
16106 +       int ret;
16107 +
16108 +       raw_spin_lock_irq(&lock->wait_lock);
16109 +       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
16110 +       raw_spin_unlock_irq(&lock->wait_lock);
16111 +
16112 +       return ret;
16113 +}
16114 +
16115 +/**
16116   * rt_mutex_next_owner - return the next owner of the lock
16117   *
16118   * @lock: the rt lock query
16119 @@ -1721,36 +2387,106 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
16120  }
16121  
16122  /**
16123 - * rt_mutex_finish_proxy_lock() - Complete lock acquisition
16124 + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
16125   * @lock:              the rt_mutex we were woken on
16126   * @to:                        the timeout, null if none. hrtimer should already have
16127   *                     been started.
16128   * @waiter:            the pre-initialized rt_mutex_waiter
16129   *
16130 - * Complete the lock acquisition started our behalf by another thread.
16131 + * Wait for the lock acquisition started on our behalf by
16132 + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
16133 + * rt_mutex_cleanup_proxy_lock().
16134   *
16135   * Returns:
16136   *  0 - success
16137   * <0 - error, one of -EINTR, -ETIMEDOUT
16138   *
16139 - * Special API call for PI-futex requeue support
16140 + * Special API call for PI-futex support
16141   */
16142 -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
16143 +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
16144                                struct hrtimer_sleeper *to,
16145                                struct rt_mutex_waiter *waiter)
16146  {
16147 +       struct task_struct *tsk = current;
16148         int ret;
16149  
16150         raw_spin_lock_irq(&lock->wait_lock);
16151 -
16152 +       /* sleep on the mutex */
16153         set_current_state(TASK_INTERRUPTIBLE);
16154 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
16155 +       /*
16156 +        * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
16157 +        * have to fix that up.
16158 +        */
16159 +       fixup_rt_mutex_waiters(lock);
16160  
16161 -       /* sleep on the mutex */
16162 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
16163 +       /*
16164 +        * RT has a problem here when the wait got interrupted by a timeout
16165 +        * or a signal. task->pi_blocked_on is still set. The task must
16166 +        * acquire the hash bucket lock when returning from this function.
16167 +        *
16168 +        * If the hash bucket lock is contended then the
16169 +        * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
16170 +        * task_blocks_on_rt_mutex() will trigger. This can be avoided by
16171 +        * clearing task->pi_blocked_on which removes the task from the
16172 +        * boosting chain of the rtmutex. That's correct because the task
16173 +        * is no longer blocked on it.
16174 +        */
16175 +       if (ret) {
16176 +               raw_spin_lock(&tsk->pi_lock);
16177 +               tsk->pi_blocked_on = NULL;
16178 +               raw_spin_unlock(&tsk->pi_lock);
16179 +       }
16180 +       raw_spin_unlock_irq(&lock->wait_lock);
16181  
16182 -       if (unlikely(ret))
16183 -               remove_waiter(lock, waiter);
16184 +       return ret;
16185 +}
16186 +
16187 +/**
16188 + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
16189 + * @lock:              the rt_mutex we were woken on
16190 + * @waiter:            the pre-initialized rt_mutex_waiter
16191 + *
16192 + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
16193 + *
16194 + * Unless we acquired the lock, we're still enqueued on the wait-list and can
16195 + * in fact still be granted ownership until we're removed. Therefore we can
16196 + * find we are in fact the owner and must disregard the
16197 + * rt_mutex_wait_proxy_lock() failure.
16198 + *
16199 + * Returns:
16200 + *  true  - we did the cleanup; we are done.
16201 + *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
16202 + *          the caller should disregard its return value.
16203 + *
16204 + * Special API call for PI-futex support
16205 + */
16206 +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
16207 +                                struct rt_mutex_waiter *waiter)
16208 +{
16209 +       bool cleanup = false;
16210  
16211 +       raw_spin_lock_irq(&lock->wait_lock);
16212 +       /*
16213 +        * Do an unconditional try-lock, this deals with the lock stealing
16214 +        * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
16215 +        * sets a NULL owner.
16216 +        *
16217 +        * We're not interested in the return value, because the subsequent
16218 +        * test on rt_mutex_owner() will infer that. If the trylock succeeded,
16219 +        * we will own the lock and it will have removed the waiter. If we
16220 +        * failed the trylock, we're still not owner and we need to remove
16221 +        * ourselves.
16222 +        */
16223 +       try_to_take_rt_mutex(lock, current, waiter);
16224 +       /*
16225 +        * Unless we're the owner, we're still enqueued on the wait_list.
16226 +        * So check if we became the owner; if not, take ourselves off the wait_list.
16227 +        */
16228 +       if (rt_mutex_owner(lock) != current) {
16229 +               remove_waiter(lock, waiter);
16230 +               cleanup = true;
16231 +       }
16232         /*
16233          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
16234          * have to fix that up.
16235 @@ -1759,5 +2495,91 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
16236  
16237         raw_spin_unlock_irq(&lock->wait_lock);
16238  
16239 +       return cleanup;
16240 +}
16241 +
16242 +static inline int
16243 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
16244 +{
16245 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
16246 +       unsigned tmp;
16247 +
16248 +       if (ctx->deadlock_inject_countdown-- == 0) {
16249 +               tmp = ctx->deadlock_inject_interval;
16250 +               if (tmp > UINT_MAX/4)
16251 +                       tmp = UINT_MAX;
16252 +               else
16253 +                       tmp = tmp*2 + tmp + tmp/2;
16254 +
16255 +               ctx->deadlock_inject_interval = tmp;
16256 +               ctx->deadlock_inject_countdown = tmp;
16257 +               ctx->contending_lock = lock;
16258 +
16259 +               ww_mutex_unlock(lock);
16260 +
16261 +               return -EDEADLK;
16262 +       }
16263 +#endif
16264 +
16265 +       return 0;
16266 +}
16267 +
16268 +#ifdef CONFIG_PREEMPT_RT_FULL
16269 +int __sched
16270 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
16271 +{
16272 +       int ret;
16273 +
16274 +       might_sleep();
16275 +
16276 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
16277 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
16278 +       if (ret)
16279 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
16280 +       else if (!ret && ww_ctx->acquired > 1)
16281 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
16282 +
16283 +       return ret;
16284 +}
16285 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
16286 +
16287 +int __sched
16288 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
16289 +{
16290 +       int ret;
16291 +
16292 +       might_sleep();
16293 +
16294 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
16295 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
16296 +       if (ret)
16297 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
16298 +       else if (!ret && ww_ctx->acquired > 1)
16299 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
16300 +
16301         return ret;
16302  }
16303 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
16304 +
16305 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
16306 +{
16307 +       int nest = !!lock->ctx;
16308 +
16309 +       /*
16310 +        * The unlocking fastpath is the 0->1 transition from 'locked'
16311 +        * into 'unlocked' state:
16312 +        */
16313 +       if (nest) {
16314 +#ifdef CONFIG_DEBUG_MUTEXES
16315 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
16316 +#endif
16317 +               if (lock->ctx->acquired > 0)
16318 +                       lock->ctx->acquired--;
16319 +               lock->ctx = NULL;
16320 +       }
16321 +
16322 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
16323 +       rt_mutex_unlock(&lock->base.lock);
16324 +}
16325 +EXPORT_SYMBOL(ww_mutex_unlock);
16326 +#endif
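
For illustration only (not part of the patch hunks above or below): a minimal sketch of the caller-side wait/die backoff that the RT ww_mutex implementation above relies on. example_lock_both() is a hypothetical helper; ww_mutex_lock(), ww_mutex_lock_slow() and ww_mutex_unlock() are the standard API from include/linux/ww_mutex.h, and ctx is assumed to come from ww_acquire_init().

/*
 * Minimal sketch, not part of the patch: the backoff protocol expected of
 * callers of the RT __ww_mutex_lock()/ww_mutex_unlock() code above.
 * example_lock_both() is hypothetical.
 */
static void example_lock_both(struct ww_mutex *a, struct ww_mutex *b,
                              struct ww_acquire_ctx *ctx)
{
retry:
        /* The first lock cannot return -EDEADLK: ctx->acquired is still 0. */
        ww_mutex_lock(a, ctx);

        if (ww_mutex_lock(b, ctx) == -EDEADLK) {
                /*
                 * An older context owns 'b'.  Back off completely, wait
                 * for 'b' in the slow path so we make forward progress,
                 * then drop it again and restart the whole sequence.
                 */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, ctx);
                ww_mutex_unlock(b);
                goto retry;
        }
        /* Both locks held; the caller does its work, then unlocks both. */
}
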
16327 diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
16328 index c4060584c407..6607802efa8b 100644
16329 --- a/kernel/locking/rtmutex.h
16330 +++ b/kernel/locking/rtmutex.h
16331 @@ -11,8 +11,6 @@
16332   */
16333  
16334  #define rt_mutex_deadlock_check(l)                     (0)
16335 -#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
16336 -#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
16337  #define debug_rt_mutex_init_waiter(w)                  do { } while (0)
16338  #define debug_rt_mutex_free_waiter(w)                  do { } while (0)
16339  #define debug_rt_mutex_lock(l)                         do { } while (0)
16340 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
16341 index e317e1cbb3eb..64d89d780059 100644
16342 --- a/kernel/locking/rtmutex_common.h
16343 +++ b/kernel/locking/rtmutex_common.h
16344 @@ -27,12 +27,14 @@ struct rt_mutex_waiter {
16345         struct rb_node          pi_tree_entry;
16346         struct task_struct      *task;
16347         struct rt_mutex         *lock;
16348 +       bool                    savestate;
16349  #ifdef CONFIG_DEBUG_RT_MUTEXES
16350         unsigned long           ip;
16351         struct pid              *deadlock_task_pid;
16352         struct rt_mutex         *deadlock_lock;
16353  #endif
16354         int prio;
16355 +       u64 deadline;
16356  };
16357  
16358  /*
16359 @@ -98,21 +100,45 @@ enum rtmutex_chainwalk {
16360  /*
16361   * PI-futex support (proxy locking functions, etc.):
16362   */
16363 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
16364 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
16365 +
16366  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
16367  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
16368                                        struct task_struct *proxy_owner);
16369  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
16370                                   struct task_struct *proxy_owner);
16371 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
16372 +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16373 +                                    struct rt_mutex_waiter *waiter,
16374 +                                    struct task_struct *task);
16375  extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
16376                                      struct rt_mutex_waiter *waiter,
16377                                      struct task_struct *task);
16378 -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
16379 -                                     struct hrtimer_sleeper *to,
16380 -                                     struct rt_mutex_waiter *waiter);
16381 -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
16382 -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
16383 -                                 struct wake_q_head *wqh);
16384 -extern void rt_mutex_adjust_prio(struct task_struct *task);
16385 +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
16386 +                              struct hrtimer_sleeper *to,
16387 +                              struct rt_mutex_waiter *waiter);
16388 +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
16389 +                                struct rt_mutex_waiter *waiter);
16390 +
16391 +extern int rt_mutex_futex_trylock(struct rt_mutex *l);
16392 +
16393 +extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
16394 +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
16395 +                                struct wake_q_head *wqh,
16396 +                                struct wake_q_head *wq_sleeper);
16397 +
16398 +extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
16399 +                               struct wake_q_head *wq_sleeper);
16400 +
16401 +/* RW semaphore special interface */
16402 +struct ww_acquire_ctx;
16403 +
16404 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
16405 +                                    struct hrtimer_sleeper *timeout,
16406 +                                    enum rtmutex_chainwalk chwalk,
16407 +                                    struct ww_acquire_ctx *ww_ctx,
16408 +                                    struct rt_mutex_waiter *waiter);
16409  
16410  #ifdef CONFIG_DEBUG_RT_MUTEXES
16411  # include "rtmutex-debug.h"
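
The rtmutex_common.h hunk above splits the proxy-lock interface into __rt_mutex_start_proxy_lock(), rt_mutex_wait_proxy_lock() and rt_mutex_cleanup_proxy_lock(). A minimal sketch of the intended caller sequence follows; example_wait_proxy() is hypothetical and heavily simplified, the real user being the requeue-PI futex path.

/*
 * Minimal sketch, not part of the patch: how the split proxy-lock API
 * declared above is meant to be driven by a requeue-PI style caller.
 */
static int example_wait_proxy(struct rt_mutex *pi_mutex,
                              struct rt_mutex_waiter *waiter,
                              struct hrtimer_sleeper *to)
{
        int ret;

        /*
         * 'waiter' was prepared with rt_mutex_init_waiter(waiter, false)
         * and enqueued on pi_mutex by the requeue side through
         * rt_mutex_start_proxy_lock().  Wait for that acquisition to
         * complete on our behalf.
         */
        ret = rt_mutex_wait_proxy_lock(pi_mutex, to, waiter);
        if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter)) {
                /*
                 * The wait failed, but we still became the owner before
                 * the cleanup could dequeue us: treat it as success.
                 */
                ret = 0;
        }
        return ret;
}
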
16412 diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
16413 new file mode 100644
16414 index 000000000000..4a708ffcded6
16415 --- /dev/null
16416 +++ b/kernel/locking/rwsem-rt.c
16417 @@ -0,0 +1,268 @@
16418 +/*
16419 + */
16420 +#include <linux/rwsem.h>
16421 +#include <linux/sched.h>
16422 +#include <linux/export.h>
16423 +
16424 +#include "rtmutex_common.h"
16425 +
16426 +/*
16427 + * RT-specific reader/writer semaphores
16428 + *
16429 + * down_write()
16430 + *  1) Lock sem->rtmutex
16431 + *  2) Remove the reader BIAS to force readers into the slow path
16432 + *  3) Wait until all readers have left the critical region
16433 + *  4) Mark it write locked
16434 + *
16435 + * up_write()
16436 + *  1) Remove the write locked marker
16437 + *  2) Set the reader BIAS so readers can use the fast path again
16438 + *  3) Unlock sem->rtmutex to release blocked readers
16439 + *
16440 + * down_read()
16441 + *  1) Try fast path acquisition (reader BIAS is set)
16442 + *  2) Take sem->rtmutex.wait_lock which protects the writelocked flag
16443 + *  3) If !writelocked, acquire it for read
16444 + *  4) If writelocked, block on sem->rtmutex
16445 + *  5) unlock sem->rtmutex, goto 1)
16446 + *
16447 + * up_read()
16448 + *  1) Try fast path release (reader count != 1)
16449 + *  2) Wake the writer waiting in down_write()#3
16450 + *
16451 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
16452 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
16453 + * are subject to the rtmutex priority/DL inheritance mechanism.
16454 + *
16455 + * It's possible to make the rw semaphores writer fair by keeping a list of
16456 + * active readers. A blocked writer would force all newly incoming readers to
16457 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
16458 + * reader after the other. We can't use multi-reader inheritance because there
16459 + * is no way to support that with SCHED_DEADLINE. Implementing the one-by-one
16460 + * reader boosting/handover mechanism would be major surgery for very dubious
16461 + * value.
16462 + *
16463 + * The risk of writer starvation is there, but the pathological use cases
16464 + * which trigger it are not necessarily the typical RT workloads.
16465 + */
16466 +
16467 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
16468 +                 struct lock_class_key *key)
16469 +{
16470 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
16471 +       /*
16472 +        * Make sure we are not reinitializing a held semaphore:
16473 +        */
16474 +       debug_check_no_locks_freed((void *)sem, sizeof(*sem));
16475 +       lockdep_init_map(&sem->dep_map, name, key, 0);
16476 +#endif
16477 +       atomic_set(&sem->readers, READER_BIAS);
16478 +}
16479 +EXPORT_SYMBOL(__rwsem_init);
16480 +
16481 +int __down_read_trylock(struct rw_semaphore *sem)
16482 +{
16483 +       int r, old;
16484 +
16485 +       /*
16486 +        * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
16487 +        * set.
16488 +        */
16489 +       for (r = atomic_read(&sem->readers); r < 0;) {
16490 +               old = atomic_cmpxchg(&sem->readers, r, r + 1);
16491 +               if (likely(old == r))
16492 +                       return 1;
16493 +               r = old;
16494 +       }
16495 +       return 0;
16496 +}
16497 +
16498 +void __sched __down_read(struct rw_semaphore *sem)
16499 +{
16500 +       struct rt_mutex *m = &sem->rtmutex;
16501 +       struct rt_mutex_waiter waiter;
16502 +
16503 +       if (__down_read_trylock(sem))
16504 +               return;
16505 +
16506 +       might_sleep();
16507 +       raw_spin_lock_irq(&m->wait_lock);
16508 +       /*
16509 +        * Allow readers as long as the writer has not completely
16510 +        * acquired the semaphore for write.
16511 +        */
16512 +       if (atomic_read(&sem->readers) != WRITER_BIAS) {
16513 +               atomic_inc(&sem->readers);
16514 +               raw_spin_unlock_irq(&m->wait_lock);
16515 +               return;
16516 +       }
16517 +
16518 +       /*
16519 +        * Call into the slow lock path with the rtmutex->wait_lock
16520 +        * held, so this can't result in the following race:
16521 +        *
16522 +        * Reader1              Reader2         Writer
16523 +        *                      down_read()
16524 +        *                                      down_write()
16525 +        *                                      rtmutex_lock(m)
16526 +        *                                      swait()
16527 +        * down_read()
16528 +        * unlock(m->wait_lock)
16529 +        *                      up_read()
16530 +        *                      swake()
16531 +        *                                      lock(m->wait_lock)
16532 +        *                                      sem->writelocked=true
16533 +        *                                      unlock(m->wait_lock)
16534 +        *
16535 +        *                                      up_write()
16536 +        *                                      sem->writelocked=false
16537 +        *                                      rtmutex_unlock(m)
16538 +        *                      down_read()
16539 +        *                                      down_write()
16540 +        *                                      rtmutex_lock(m)
16541 +        *                                      swait()
16542 +        * rtmutex_lock(m)
16543 +        *
16544 +        * That would put Reader1 behind the writer waiting on
16545 +        * Reader2 to call up_read() which might be unbound.
16546 +        */
16547 +       rt_mutex_init_waiter(&waiter, false);
16548 +       rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
16549 +                                RT_MUTEX_MIN_CHAINWALK, NULL,
16550 +                                &waiter);
16551 +       /*
16552 +        * The slowlock() above is guaranteed to return with the rtmutex
16553 +        * now held, so there can't be a writer active. Increment the reader
16554 +        * count and immediately drop the rtmutex again.
16555 +        */
16556 +       atomic_inc(&sem->readers);
16557 +       raw_spin_unlock_irq(&m->wait_lock);
16558 +       rt_mutex_unlock(m);
16559 +
16560 +       debug_rt_mutex_free_waiter(&waiter);
16561 +}
16562 +
16563 +void __up_read(struct rw_semaphore *sem)
16564 +{
16565 +       struct rt_mutex *m = &sem->rtmutex;
16566 +       struct task_struct *tsk;
16567 +
16568 +       /*
16569 +        * sem->readers can only hit 0 when a writer is waiting for the
16570 +        * active readers to leave the critical region.
16571 +        */
16572 +       if (!atomic_dec_and_test(&sem->readers))
16573 +               return;
16574 +
16575 +       might_sleep();
16576 +       raw_spin_lock_irq(&m->wait_lock);
16577 +       /*
16578 +        * Wake the writer, i.e. the rtmutex owner. It might release the
16579 +        * rtmutex concurrently in the fast path (due to a signal), but to
16580 +        * clean up the rwsem it needs to acquire m->wait_lock. The worst
16581 +        * case which can happen is a spurious wakeup.
16582 +        */
16583 +       tsk = rt_mutex_owner(m);
16584 +       if (tsk)
16585 +               wake_up_process(tsk);
16586 +
16587 +       raw_spin_unlock_irq(&m->wait_lock);
16588 +}
16589 +
16590 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
16591 +                             unsigned long flags)
16592 +{
16593 +       struct rt_mutex *m = &sem->rtmutex;
16594 +
16595 +       atomic_add(READER_BIAS - bias, &sem->readers);
16596 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16597 +       rt_mutex_unlock(m);
16598 +}
16599 +
16600 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
16601 +{
16602 +       struct rt_mutex *m = &sem->rtmutex;
16603 +       unsigned long flags;
16604 +
16605 +       /* Take the rtmutex as a first step */
16606 +       if (rt_mutex_lock_state(m, state))
16607 +               return -EINTR;
16608 +
16609 +       /* Force readers into slow path */
16610 +       atomic_sub(READER_BIAS, &sem->readers);
16611 +       might_sleep();
16612 +
16613 +       set_current_state(state);
16614 +       for (;;) {
16615 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
16616 +               /* Have all readers left the critical region? */
16617 +               if (!atomic_read(&sem->readers)) {
16618 +                       atomic_set(&sem->readers, WRITER_BIAS);
16619 +                       __set_current_state(TASK_RUNNING);
16620 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16621 +                       return 0;
16622 +               }
16623 +
16624 +               if (signal_pending_state(state, current)) {
16625 +                       __set_current_state(TASK_RUNNING);
16626 +                       __up_write_unlock(sem, 0, flags);
16627 +                       return -EINTR;
16628 +               }
16629 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16630 +
16631 +               if (atomic_read(&sem->readers) != 0) {
16632 +                       schedule();
16633 +                       set_current_state(state);
16634 +               }
16635 +       }
16636 +}
16637 +
16638 +void __sched __down_write(struct rw_semaphore *sem)
16639 +{
16640 +       __down_write_common(sem, TASK_UNINTERRUPTIBLE);
16641 +}
16642 +
16643 +int __sched __down_write_killable(struct rw_semaphore *sem)
16644 +{
16645 +       return __down_write_common(sem, TASK_KILLABLE);
16646 +}
16647 +
16648 +int __down_write_trylock(struct rw_semaphore *sem)
16649 +{
16650 +       struct rt_mutex *m = &sem->rtmutex;
16651 +       unsigned long flags;
16652 +
16653 +       if (!rt_mutex_trylock(m))
16654 +               return 0;
16655 +
16656 +       atomic_sub(READER_BIAS, &sem->readers);
16657 +
16658 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16659 +       if (!atomic_read(&sem->readers)) {
16660 +               atomic_set(&sem->readers, WRITER_BIAS);
16661 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16662 +               return 1;
16663 +       }
16664 +       __up_write_unlock(sem, 0, flags);
16665 +       return 0;
16666 +}
16667 +
16668 +void __up_write(struct rw_semaphore *sem)
16669 +{
16670 +       struct rt_mutex *m = &sem->rtmutex;
16671 +       unsigned long flags;
16672 +
16673 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16674 +       __up_write_unlock(sem, WRITER_BIAS, flags);
16675 +}
16676 +
16677 +void __downgrade_write(struct rw_semaphore *sem)
16678 +{
16679 +       struct rt_mutex *m = &sem->rtmutex;
16680 +       unsigned long flags;
16681 +
16682 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16683 +       /* Release it and account current as reader */
16684 +       __up_write_unlock(sem, WRITER_BIAS - 1, flags);
16685 +}
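
The rwsem-rt code above keeps all reader accounting in a single atomic: the count starts at READER_BIAS (negative when read as a signed value), readers use the fast path only while that bias is present, and a writer subtracts the bias so later readers fall into the rtmutex slow path. The following standalone sketch models just that counting scheme with C11 atomics; the bias constants, helper names and the polling writer are illustrative stand-ins, not the kernel implementation.

/*
 * Userspace model of the reader-biased counting used by the rwsem-rt code
 * above.  READER_BIAS / WRITER_BIAS values are illustrative only; the
 * kernel uses different constants and an rtmutex for the slow path.
 */
#include <stdatomic.h>
#include <stdio.h>

#define READER_BIAS     (-1000000)      /* present while no writer owns the lock */
#define WRITER_BIAS     (1000000)       /* marks a fully established writer */

static atomic_int readers = READER_BIAS;

/* Fast path of __down_read_trylock(): succeed only while the bias is set. */
static int down_read_trylock_fast(void)
{
        int r = atomic_load(&readers);

        while (r < 0) {                 /* bias present -> no writer */
                if (atomic_compare_exchange_weak(&readers, &r, r + 1))
                        return 1;       /* reader accounted */
        }
        return 0;                       /* writer active, would take slow path */
}

static void up_read_fast(void)
{
        atomic_fetch_sub(&readers, 1);
}

/* Writer side: remove the bias, then check whether all readers have left. */
static int down_write_poll(void)
{
        atomic_fetch_sub(&readers, READER_BIAS);        /* force readers slow */
        if (atomic_load(&readers) == 0) {
                atomic_store(&readers, WRITER_BIAS);    /* fully owned */
                return 1;
        }
        /* the kernel sleeps on the rtmutex here instead of giving up */
        atomic_fetch_add(&readers, READER_BIAS);        /* undo, like __up_write_unlock() */
        return 0;
}

int main(void)
{
        printf("reader fast path: %d\n", down_read_trylock_fast());
        up_read_fast();
        printf("writer claim:     %d\n", down_write_poll());
        return 0;
}
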
16686 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
16687 index db3ccb1dd614..909779647bd1 100644
16688 --- a/kernel/locking/spinlock.c
16689 +++ b/kernel/locking/spinlock.c
16690 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
16691   *         __[spin|read|write]_lock_bh()
16692   */
16693  BUILD_LOCK_OPS(spin, raw_spinlock);
16694 +
16695 +#ifndef CONFIG_PREEMPT_RT_FULL
16696  BUILD_LOCK_OPS(read, rwlock);
16697  BUILD_LOCK_OPS(write, rwlock);
16698 +#endif
16699  
16700  #endif
16701  
16702 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
16703  EXPORT_SYMBOL(_raw_spin_unlock_bh);
16704  #endif
16705  
16706 +#ifndef CONFIG_PREEMPT_RT_FULL
16707 +
16708  #ifndef CONFIG_INLINE_READ_TRYLOCK
16709  int __lockfunc _raw_read_trylock(rwlock_t *lock)
16710  {
16711 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
16712  EXPORT_SYMBOL(_raw_write_unlock_bh);
16713  #endif
16714  
16715 +#endif /* !PREEMPT_RT_FULL */
16716 +
16717  #ifdef CONFIG_DEBUG_LOCK_ALLOC
16718  
16719  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
16720 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
16721 index 9aa0fccd5d43..76d0b40d9193 100644
16722 --- a/kernel/locking/spinlock_debug.c
16723 +++ b/kernel/locking/spinlock_debug.c
16724 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
16725  
16726  EXPORT_SYMBOL(__raw_spin_lock_init);
16727  
16728 +#ifndef CONFIG_PREEMPT_RT_FULL
16729  void __rwlock_init(rwlock_t *lock, const char *name,
16730                    struct lock_class_key *key)
16731  {
16732 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
16733  }
16734  
16735  EXPORT_SYMBOL(__rwlock_init);
16736 +#endif
16737  
16738  static void spin_dump(raw_spinlock_t *lock, const char *msg)
16739  {
16740 @@ -135,6 +137,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
16741         arch_spin_unlock(&lock->raw_lock);
16742  }
16743  
16744 +#ifndef CONFIG_PREEMPT_RT_FULL
16745  static void rwlock_bug(rwlock_t *lock, const char *msg)
16746  {
16747         if (!debug_locks_off())
16748 @@ -224,3 +227,5 @@ void do_raw_write_unlock(rwlock_t *lock)
16749         debug_write_unlock(lock);
16750         arch_write_unlock(&lock->raw_lock);
16751  }
16752 +
16753 +#endif
16754 diff --git a/kernel/module.c b/kernel/module.c
16755 index 0e54d5bf0097..f27764fbfa24 100644
16756 --- a/kernel/module.c
16757 +++ b/kernel/module.c
16758 @@ -660,16 +660,7 @@ static void percpu_modcopy(struct module *mod,
16759                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
16760  }
16761  
16762 -/**
16763 - * is_module_percpu_address - test whether address is from module static percpu
16764 - * @addr: address to test
16765 - *
16766 - * Test whether @addr belongs to module static percpu area.
16767 - *
16768 - * RETURNS:
16769 - * %true if @addr is from module static percpu area
16770 - */
16771 -bool is_module_percpu_address(unsigned long addr)
16772 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
16773  {
16774         struct module *mod;
16775         unsigned int cpu;
16776 @@ -683,9 +674,15 @@ bool is_module_percpu_address(unsigned long addr)
16777                         continue;
16778                 for_each_possible_cpu(cpu) {
16779                         void *start = per_cpu_ptr(mod->percpu, cpu);
16780 -
16781 -                       if ((void *)addr >= start &&
16782 -                           (void *)addr < start + mod->percpu_size) {
16783 +                       void *va = (void *)addr;
16784 +
16785 +                       if (va >= start && va < start + mod->percpu_size) {
16786 +                               if (can_addr) {
16787 +                                       *can_addr = (unsigned long) (va - start);
16788 +                                       *can_addr += (unsigned long)
16789 +                                               per_cpu_ptr(mod->percpu,
16790 +                                                           get_boot_cpu_id());
16791 +                               }
16792                                 preempt_enable();
16793                                 return true;
16794                         }
16795 @@ -696,6 +693,20 @@ bool is_module_percpu_address(unsigned long addr)
16796         return false;
16797  }
16798  
16799 +/**
16800 + * is_module_percpu_address - test whether address is from module static percpu
16801 + * @addr: address to test
16802 + *
16803 + * Test whether @addr belongs to module static percpu area.
16804 + *
16805 + * RETURNS:
16806 + * %true if @addr is from module static percpu area
16807 + */
16808 +bool is_module_percpu_address(unsigned long addr)
16809 +{
16810 +       return __is_module_percpu_address(addr, NULL);
16811 +}
16812 +
16813  #else /* ... !CONFIG_SMP */
16814  
16815  static inline void __percpu *mod_percpu(struct module *mod)
16816 @@ -727,6 +738,11 @@ bool is_module_percpu_address(unsigned long addr)
16817         return false;
16818  }
16819  
16820 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
16821 +{
16822 +       return false;
16823 +}
16824 +
16825  #endif /* CONFIG_SMP */
16826  
16827  #define MODINFO_ATTR(field)    \
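
The kernel/module.c hunk above splits out __is_module_percpu_address() so callers can also obtain the canonical address: the offset of the address inside whichever CPU's copy of the module per-CPU area it falls into, rebased onto the boot CPU's copy. Below is a minimal userspace sketch of that address arithmetic, with plain arrays standing in for per_cpu_ptr(); nothing here is kernel API.

/*
 * Minimal sketch of the can_addr computation added above: translate an
 * address found in any CPU's copy of a per-CPU area into the matching
 * address inside the boot CPU's copy.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS         4
#define PERCPU_SIZE     64
#define BOOT_CPU        0

static unsigned char percpu_area[NR_CPUS][PERCPU_SIZE];

static bool is_module_percpu_address_model(uintptr_t addr, uintptr_t *can_addr)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                uintptr_t start = (uintptr_t)percpu_area[cpu];

                if (addr >= start && addr < start + PERCPU_SIZE) {
                        if (can_addr)   /* offset in this copy + boot CPU base */
                                *can_addr = (addr - start) +
                                            (uintptr_t)percpu_area[BOOT_CPU];
                        return true;
                }
        }
        return false;
}

int main(void)
{
        uintptr_t can;
        uintptr_t probe = (uintptr_t)&percpu_area[2][13];

        if (is_module_percpu_address_model(probe, &can))
                printf("offset %zu maps to boot copy at %p\n",
                       (size_t)(can - (uintptr_t)percpu_area[BOOT_CPU]),
                       (void *)can);
        return 0;
}
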
16828 diff --git a/kernel/panic.c b/kernel/panic.c
16829 index dbec387099b1..b67a4803ff2b 100644
16830 --- a/kernel/panic.c
16831 +++ b/kernel/panic.c
16832 @@ -482,9 +482,11 @@ static u64 oops_id;
16833  
16834  static int init_oops_id(void)
16835  {
16836 +#ifndef CONFIG_PREEMPT_RT_FULL
16837         if (!oops_id)
16838                 get_random_bytes(&oops_id, sizeof(oops_id));
16839         else
16840 +#endif
16841                 oops_id++;
16842  
16843         return 0;
16844 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
16845 index b26dbc48c75b..968255f27a33 100644
16846 --- a/kernel/power/hibernate.c
16847 +++ b/kernel/power/hibernate.c
16848 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
16849  
16850         local_irq_disable();
16851  
16852 +       system_state = SYSTEM_SUSPEND;
16853 +
16854         error = syscore_suspend();
16855         if (error) {
16856                 printk(KERN_ERR "PM: Some system devices failed to power down, "
16857 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
16858         syscore_resume();
16859  
16860   Enable_irqs:
16861 +       system_state = SYSTEM_RUNNING;
16862         local_irq_enable();
16863  
16864   Enable_cpus:
16865 @@ -446,6 +449,7 @@ static int resume_target_kernel(bool platform_mode)
16866                 goto Enable_cpus;
16867  
16868         local_irq_disable();
16869 +       system_state = SYSTEM_SUSPEND;
16870  
16871         error = syscore_suspend();
16872         if (error)
16873 @@ -479,6 +483,7 @@ static int resume_target_kernel(bool platform_mode)
16874         syscore_resume();
16875  
16876   Enable_irqs:
16877 +       system_state = SYSTEM_RUNNING;
16878         local_irq_enable();
16879  
16880   Enable_cpus:
16881 @@ -564,6 +569,7 @@ int hibernation_platform_enter(void)
16882                 goto Enable_cpus;
16883  
16884         local_irq_disable();
16885 +       system_state = SYSTEM_SUSPEND;
16886         syscore_suspend();
16887         if (pm_wakeup_pending()) {
16888                 error = -EAGAIN;
16889 @@ -576,6 +582,7 @@ int hibernation_platform_enter(void)
16890  
16891   Power_up:
16892         syscore_resume();
16893 +       system_state = SYSTEM_RUNNING;
16894         local_irq_enable();
16895  
16896   Enable_cpus:
16897 @@ -676,6 +683,10 @@ static int load_image_and_restore(void)
16898         return error;
16899  }
16900  
16901 +#ifndef CONFIG_SUSPEND
16902 +bool pm_in_action;
16903 +#endif
16904 +
16905  /**
16906   * hibernate - Carry out system hibernation, including saving the image.
16907   */
16908 @@ -689,6 +700,8 @@ int hibernate(void)
16909                 return -EPERM;
16910         }
16911  
16912 +       pm_in_action = true;
16913 +
16914         lock_system_sleep();
16915         /* The snapshot device should not be opened while we're running */
16916         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
16917 @@ -766,6 +779,7 @@ int hibernate(void)
16918         atomic_inc(&snapshot_device_available);
16919   Unlock:
16920         unlock_system_sleep();
16921 +       pm_in_action = false;
16922         return error;
16923  }
16924  
16925 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
16926 index 6ccb08f57fcb..c8cbb5ed2fe3 100644
16927 --- a/kernel/power/suspend.c
16928 +++ b/kernel/power/suspend.c
16929 @@ -369,6 +369,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
16930         arch_suspend_disable_irqs();
16931         BUG_ON(!irqs_disabled());
16932  
16933 +       system_state = SYSTEM_SUSPEND;
16934 +
16935         error = syscore_suspend();
16936         if (!error) {
16937                 *wakeup = pm_wakeup_pending();
16938 @@ -385,6 +387,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
16939                 syscore_resume();
16940         }
16941  
16942 +       system_state = SYSTEM_RUNNING;
16943 +
16944         arch_suspend_enable_irqs();
16945         BUG_ON(irqs_disabled());
16946  
16947 @@ -527,6 +531,8 @@ static int enter_state(suspend_state_t state)
16948         return error;
16949  }
16950  
16951 +bool pm_in_action;
16952 +
16953  /**
16954   * pm_suspend - Externally visible function for suspending the system.
16955   * @state: System sleep state to enter.
16956 @@ -541,6 +547,8 @@ int pm_suspend(suspend_state_t state)
16957         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
16958                 return -EINVAL;
16959  
16960 +       pm_in_action = true;
16961 +
16962         error = enter_state(state);
16963         if (error) {
16964                 suspend_stats.fail++;
16965 @@ -548,6 +556,7 @@ int pm_suspend(suspend_state_t state)
16966         } else {
16967                 suspend_stats.success++;
16968         }
16969 +       pm_in_action = false;
16970         return error;
16971  }
16972  EXPORT_SYMBOL(pm_suspend);
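
The hibernate/suspend hunks bracket the interrupts-off syscore phase with system_state = SYSTEM_SUSPEND and publish a pm_in_action flag around the whole operation, so that other code can recognise the suspend path and avoid work that is not legal there (on RT, typically taking sleeping locks with interrupts disabled). A small sketch of that state-flag pattern follows; the names are illustrative stand-ins, not the kernel definitions.

/*
 * Sketch of the state-flag pattern used above: publish "we are in the
 * irqs-off suspend phase" so other code can detect it and back off.
 */
#include <stdbool.h>
#include <stdio.h>

enum system_states { SYSTEM_RUNNING, SYSTEM_SUSPEND };

static enum system_states system_state = SYSTEM_RUNNING;
static bool pm_in_action;

static void maybe_sleeping_operation(void)
{
        /* A consumer backs off instead of sleeping while suspending. */
        if (system_state == SYSTEM_SUSPEND) {
                printf("suspend phase: skipping sleeping work\n");
                return;
        }
        printf("normal path: doing sleeping work\n");
}

static void suspend_enter_model(void)
{
        pm_in_action = true;
        /* local_irq_disable() would happen here in the kernel */
        system_state = SYSTEM_SUSPEND;

        maybe_sleeping_operation();     /* takes the back-off path */

        system_state = SYSTEM_RUNNING;
        /* local_irq_enable() */
        pm_in_action = false;
}

int main(void)
{
        maybe_sleeping_operation();     /* normal path */
        suspend_enter_model();
        return 0;
}
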
16973 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
16974 index 9c5b231684d0..cf15bdb6855b 100644
16975 --- a/kernel/printk/printk.c
16976 +++ b/kernel/printk/printk.c
16977 @@ -351,6 +351,65 @@ __packed __aligned(4)
16978   */
16979  DEFINE_RAW_SPINLOCK(logbuf_lock);
16980  
16981 +#ifdef CONFIG_EARLY_PRINTK
16982 +struct console *early_console;
16983 +
16984 +static void early_vprintk(const char *fmt, va_list ap)
16985 +{
16986 +       if (early_console) {
16987 +               char buf[512];
16988 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
16989 +
16990 +               early_console->write(early_console, buf, n);
16991 +       }
16992 +}
16993 +
16994 +asmlinkage void early_printk(const char *fmt, ...)
16995 +{
16996 +       va_list ap;
16997 +
16998 +       va_start(ap, fmt);
16999 +       early_vprintk(fmt, ap);
17000 +       va_end(ap);
17001 +}
17002 +
17003 +/*
17004 + * This is independent of any log levels - a global
17005 + * kill switch that turns off all of printk.
17006 + *
17007 + * Used by the NMI watchdog if early-printk is enabled.
17008 + */
17009 +static bool __read_mostly printk_killswitch;
17010 +
17011 +static int __init force_early_printk_setup(char *str)
17012 +{
17013 +       printk_killswitch = true;
17014 +       return 0;
17015 +}
17016 +early_param("force_early_printk", force_early_printk_setup);
17017 +
17018 +void printk_kill(void)
17019 +{
17020 +       printk_killswitch = true;
17021 +}
17022 +
17023 +#ifdef CONFIG_PRINTK
17024 +static int forced_early_printk(const char *fmt, va_list ap)
17025 +{
17026 +       if (!printk_killswitch)
17027 +               return 0;
17028 +       early_vprintk(fmt, ap);
17029 +       return 1;
17030 +}
17031 +#endif
17032 +
17033 +#else
17034 +static inline int forced_early_printk(const char *fmt, va_list ap)
17035 +{
17036 +       return 0;
17037 +}
17038 +#endif
17039 +
17040  #ifdef CONFIG_PRINTK
17041  DECLARE_WAIT_QUEUE_HEAD(log_wait);
17042  /* the next printk record to read by syslog(READ) or /proc/kmsg */
17043 @@ -1337,6 +1396,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17044  {
17045         char *text;
17046         int len = 0;
17047 +       int attempts = 0;
17048  
17049         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
17050         if (!text)
17051 @@ -1348,6 +1408,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17052                 u64 seq;
17053                 u32 idx;
17054                 enum log_flags prev;
17055 +               int num_msg;
17056 +try_again:
17057 +               attempts++;
17058 +               if (attempts > 10) {
17059 +                       len = -EBUSY;
17060 +                       goto out;
17061 +               }
17062 +               num_msg = 0;
17063  
17064                 /*
17065                  * Find first record that fits, including all following records,
17066 @@ -1363,6 +1431,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17067                         prev = msg->flags;
17068                         idx = log_next(idx);
17069                         seq++;
17070 +                       num_msg++;
17071 +                       if (num_msg > 5) {
17072 +                               num_msg = 0;
17073 +                               raw_spin_unlock_irq(&logbuf_lock);
17074 +                               raw_spin_lock_irq(&logbuf_lock);
17075 +                               if (clear_seq < log_first_seq)
17076 +                                       goto try_again;
17077 +                       }
17078                 }
17079  
17080                 /* move first record forward until length fits into the buffer */
17081 @@ -1376,6 +1452,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17082                         prev = msg->flags;
17083                         idx = log_next(idx);
17084                         seq++;
17085 +                       num_msg++;
17086 +                       if (num_msg > 5) {
17087 +                               num_msg = 0;
17088 +                               raw_spin_unlock_irq(&logbuf_lock);
17089 +                               raw_spin_lock_irq(&logbuf_lock);
17090 +                               if (clear_seq < log_first_seq)
17091 +                                       goto try_again;
17092 +                       }
17093                 }
17094  
17095                 /* last message fitting into this dump */
17096 @@ -1416,6 +1500,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
17097                 clear_seq = log_next_seq;
17098                 clear_idx = log_next_idx;
17099         }
17100 +out:
17101         raw_spin_unlock_irq(&logbuf_lock);
17102  
17103         kfree(text);
17104 @@ -1569,6 +1654,12 @@ static void call_console_drivers(int level,
17105         if (!console_drivers)
17106                 return;
17107  
17108 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
17109 +               if (in_irq() || in_nmi())
17110 +                       return;
17111 +       }
17112 +
17113 +       migrate_disable();
17114         for_each_console(con) {
17115                 if (exclusive_console && con != exclusive_console)
17116                         continue;
17117 @@ -1584,6 +1675,7 @@ static void call_console_drivers(int level,
17118                 else
17119                         con->write(con, text, len);
17120         }
17121 +       migrate_enable();
17122  }
17123  
17124  /*
17125 @@ -1781,6 +1873,13 @@ asmlinkage int vprintk_emit(int facility, int level,
17126         /* cpu currently holding logbuf_lock in this function */
17127         static unsigned int logbuf_cpu = UINT_MAX;
17128  
17129 +       /*
17130 +        * Fall back to early_printk if a debugging subsystem has
17131 +        * killed printk output
17132 +        */
17133 +       if (unlikely(forced_early_printk(fmt, args)))
17134 +               return 1;
17135 +
17136         if (level == LOGLEVEL_SCHED) {
17137                 level = LOGLEVEL_DEFAULT;
17138                 in_sched = true;
17139 @@ -1885,13 +1984,23 @@ asmlinkage int vprintk_emit(int facility, int level,
17140  
17141         /* If called from the scheduler, we can not call up(). */
17142         if (!in_sched) {
17143 +               int may_trylock = 1;
17144 +
17145                 lockdep_off();
17146 +#ifdef CONFIG_PREEMPT_RT_FULL
17147 +               /*
17148 +                * we can't take a sleeping lock with IRQs or preemption
17149 +                * so we can't print in these contexts
17150 +                */
17151 +               if (!(preempt_count() == 0 && !irqs_disabled()))
17152 +                       may_trylock = 0;
17153 +#endif
17154                 /*
17155                  * Try to acquire and then immediately release the console
17156                  * semaphore.  The release will print out buffers and wake up
17157                  * /dev/kmsg and syslog() users.
17158                  */
17159 -               if (console_trylock())
17160 +               if (may_trylock && console_trylock())
17161                         console_unlock();
17162                 lockdep_on();
17163         }
17164 @@ -2014,26 +2123,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
17165  
17166  #endif /* CONFIG_PRINTK */
17167  
17168 -#ifdef CONFIG_EARLY_PRINTK
17169 -struct console *early_console;
17170 -
17171 -asmlinkage __visible void early_printk(const char *fmt, ...)
17172 -{
17173 -       va_list ap;
17174 -       char buf[512];
17175 -       int n;
17176 -
17177 -       if (!early_console)
17178 -               return;
17179 -
17180 -       va_start(ap, fmt);
17181 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
17182 -       va_end(ap);
17183 -
17184 -       early_console->write(early_console, buf, n);
17185 -}
17186 -#endif
17187 -
17188  static int __add_preferred_console(char *name, int idx, char *options,
17189                                    char *brl_options)
17190  {
17191 @@ -2303,11 +2392,16 @@ static void console_cont_flush(char *text, size_t size)
17192                 goto out;
17193  
17194         len = cont_print_text(text, size);
17195 +#ifdef CONFIG_PREEMPT_RT_FULL
17196 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
17197 +       call_console_drivers(cont.level, NULL, 0, text, len);
17198 +#else
17199         raw_spin_unlock(&logbuf_lock);
17200         stop_critical_timings();
17201         call_console_drivers(cont.level, NULL, 0, text, len);
17202         start_critical_timings();
17203         local_irq_restore(flags);
17204 +#endif
17205         return;
17206  out:
17207         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
17208 @@ -2431,13 +2525,17 @@ void console_unlock(void)
17209                 console_idx = log_next(console_idx);
17210                 console_seq++;
17211                 console_prev = msg->flags;
17212 +#ifdef CONFIG_PREEMPT_RT_FULL
17213 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
17214 +               call_console_drivers(level, ext_text, ext_len, text, len);
17215 +#else
17216                 raw_spin_unlock(&logbuf_lock);
17217  
17218                 stop_critical_timings();        /* don't trace print latency */
17219                 call_console_drivers(level, ext_text, ext_len, text, len);
17220                 start_critical_timings();
17221                 local_irq_restore(flags);
17222 -
17223 +#endif
17224                 if (do_cond_resched)
17225                         cond_resched();
17226         }
17227 @@ -2489,6 +2587,11 @@ void console_unblank(void)
17228  {
17229         struct console *c;
17230  
17231 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
17232 +               if (in_irq() || in_nmi())
17233 +                       return;
17234 +       }
17235 +
17236         /*
17237          * console_unblank can no longer be called in interrupt context unless
17238          * oops_in_progress is set to 1..
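
The printk changes above consolidate early_printk() next to vprintk_emit() and add a killswitch: once printk_kill() has been called or the force_early_printk boot parameter is set, formatted output goes straight to the early console and the normal logbuf/console path is skipped. A userspace model of that fallback decision is sketched below; stderr stands in for the early console, and none of this is the kernel code.

/*
 * Userspace model of the forced_early_printk() fallback added above:
 * when the killswitch is set, output bypasses the normal path and is
 * written raw to an "early console".
 */
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static bool printk_killswitch;

static void early_write(const char *buf, size_t n)
{
        write(STDERR_FILENO, buf, n);   /* models early_console->write() */
}

static int forced_early_printk_model(const char *fmt, va_list ap)
{
        char buf[512];

        if (!printk_killswitch)
                return 0;               /* let the normal path run */
        vsnprintf(buf, sizeof(buf), fmt, ap);
        early_write(buf, strlen(buf));
        return 1;                       /* handled, normal path skipped */
}

static int printk_model(const char *fmt, ...)
{
        va_list ap, ap2;
        int handled;

        va_start(ap, fmt);
        va_copy(ap2, ap);
        handled = forced_early_printk_model(fmt, ap);
        if (!handled)
                vprintf(fmt, ap2);      /* stands in for the logbuf path */
        va_end(ap2);
        va_end(ap);
        return 1;
}

int main(void)
{
        printk_model("hello before kill\n");
        printk_killswitch = true;       /* printk_kill() */
        printk_model("hello after kill\n");
        return 0;
}
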
17239 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
17240 index f39a7be98fc1..583ce3aad891 100644
17241 --- a/kernel/ptrace.c
17242 +++ b/kernel/ptrace.c
17243 @@ -172,7 +172,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
17244  
17245         spin_lock_irq(&task->sighand->siglock);
17246         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
17247 -               task->state = __TASK_TRACED;
17248 +               unsigned long flags;
17249 +
17250 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
17251 +               if (task->state & __TASK_TRACED)
17252 +                       task->state = __TASK_TRACED;
17253 +               else
17254 +                       task->saved_state = __TASK_TRACED;
17255 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
17256                 ret = true;
17257         }
17258         spin_unlock_irq(&task->sighand->siglock);
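
On PREEMPT_RT a task blocked on a converted spinlock sleeps in TASK_UNINTERRUPTIBLE on the underlying rtmutex while its original sleep state is parked in task->saved_state; the ptrace hunk therefore narrows __TASK_TRACED in whichever of the two fields currently carries it. A compact model of that choice follows; the struct and constants are illustrative only.

/*
 * Sketch of the state/saved_state split the ptrace hunk handles on RT.
 * In the kernel, saved_state is only meaningful while the task sleeps
 * on an rtmutex that replaced a spinlock.
 */
#include <stdio.h>

#define TASK_RUNNING            0x0000
#define TASK_UNINTERRUPTIBLE    0x0002
#define __TASK_TRACED           0x0008

struct task_model {
        unsigned int state;             /* what the scheduler sees */
        unsigned int saved_state;       /* parked state while on an rtmutex */
};

/* Narrow the traced state to __TASK_TRACED in whichever field carries it. */
static void freeze_traced(struct task_model *t)
{
        if (t->state & __TASK_TRACED)
                t->state = __TASK_TRACED;
        else
                t->saved_state = __TASK_TRACED;
}

int main(void)
{
        /* Tracee currently blocked on a sleeping-spinlock substitute. */
        struct task_model t = {
                .state = TASK_UNINTERRUPTIBLE,
                .saved_state = __TASK_TRACED,
        };

        freeze_traced(&t);
        printf("state=%#x saved_state=%#x\n", t.state, t.saved_state);
        return 0;
}
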
17259 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
17260 index bf08fee53dc7..eeb8ce4ad7b6 100644
17261 --- a/kernel/rcu/rcutorture.c
17262 +++ b/kernel/rcu/rcutorture.c
17263 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
17264         .name           = "rcu"
17265  };
17266  
17267 +#ifndef CONFIG_PREEMPT_RT_FULL
17268  /*
17269   * Definitions for rcu_bh torture testing.
17270   */
17271 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
17272         .name           = "rcu_bh"
17273  };
17274  
17275 +#else
17276 +static struct rcu_torture_ops rcu_bh_ops = {
17277 +       .ttype          = INVALID_RCU_FLAVOR,
17278 +};
17279 +#endif
17280 +
17281  /*
17282   * Don't even think about trying any of these in real life!!!
17283   * The names includes "busted", and they really means it!
17284 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
17285 index d1a02877a42c..a7b11a29e03a 100644
17286 --- a/kernel/rcu/tree.c
17287 +++ b/kernel/rcu/tree.c
17288 @@ -55,6 +55,11 @@
17289  #include <linux/random.h>
17290  #include <linux/trace_events.h>
17291  #include <linux/suspend.h>
17292 +#include <linux/delay.h>
17293 +#include <linux/gfp.h>
17294 +#include <linux/oom.h>
17295 +#include <linux/smpboot.h>
17296 +#include "../time/tick-internal.h"
17297  
17298  #include "tree.h"
17299  #include "rcu.h"
17300 @@ -260,6 +265,19 @@ void rcu_sched_qs(void)
17301                            this_cpu_ptr(&rcu_sched_data), true);
17302  }
17303  
17304 +#ifdef CONFIG_PREEMPT_RT_FULL
17305 +static void rcu_preempt_qs(void);
17306 +
17307 +void rcu_bh_qs(void)
17308 +{
17309 +       unsigned long flags;
17310 +
17311 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
17312 +       local_irq_save(flags);
17313 +       rcu_preempt_qs();
17314 +       local_irq_restore(flags);
17315 +}
17316 +#else
17317  void rcu_bh_qs(void)
17318  {
17319         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
17320 @@ -269,6 +287,7 @@ void rcu_bh_qs(void)
17321                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
17322         }
17323  }
17324 +#endif
17325  
17326  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
17327  
17328 @@ -449,11 +468,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
17329  /*
17330   * Return the number of RCU BH batches started thus far for debug & stats.
17331   */
17332 +#ifndef CONFIG_PREEMPT_RT_FULL
17333  unsigned long rcu_batches_started_bh(void)
17334  {
17335         return rcu_bh_state.gpnum;
17336  }
17337  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
17338 +#endif
17339  
17340  /*
17341   * Return the number of RCU batches completed thus far for debug & stats.
17342 @@ -473,6 +494,7 @@ unsigned long rcu_batches_completed_sched(void)
17343  }
17344  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
17345  
17346 +#ifndef CONFIG_PREEMPT_RT_FULL
17347  /*
17348   * Return the number of RCU BH batches completed thus far for debug & stats.
17349   */
17350 @@ -481,6 +503,7 @@ unsigned long rcu_batches_completed_bh(void)
17351         return rcu_bh_state.completed;
17352  }
17353  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
17354 +#endif
17355  
17356  /*
17357   * Return the number of RCU expedited batches completed thus far for
17358 @@ -504,6 +527,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
17359  }
17360  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
17361  
17362 +#ifndef CONFIG_PREEMPT_RT_FULL
17363  /*
17364   * Force a quiescent state.
17365   */
17366 @@ -522,6 +546,13 @@ void rcu_bh_force_quiescent_state(void)
17367  }
17368  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
17369  
17370 +#else
17371 +void rcu_force_quiescent_state(void)
17372 +{
17373 +}
17374 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
17375 +#endif
17376 +
17377  /*
17378   * Force a quiescent state for RCU-sched.
17379   */
17380 @@ -572,9 +603,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
17381         case RCU_FLAVOR:
17382                 rsp = rcu_state_p;
17383                 break;
17384 +#ifndef CONFIG_PREEMPT_RT_FULL
17385         case RCU_BH_FLAVOR:
17386                 rsp = &rcu_bh_state;
17387                 break;
17388 +#endif
17389         case RCU_SCHED_FLAVOR:
17390                 rsp = &rcu_sched_state;
17391                 break;
17392 @@ -3026,18 +3059,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
17393  /*
17394   * Do RCU core processing for the current CPU.
17395   */
17396 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
17397 +static __latent_entropy void rcu_process_callbacks(void)
17398  {
17399         struct rcu_state *rsp;
17400  
17401         if (cpu_is_offline(smp_processor_id()))
17402                 return;
17403 -       trace_rcu_utilization(TPS("Start RCU core"));
17404         for_each_rcu_flavor(rsp)
17405                 __rcu_process_callbacks(rsp);
17406 -       trace_rcu_utilization(TPS("End RCU core"));
17407  }
17408  
17409 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
17410  /*
17411   * Schedule RCU callback invocation.  If the specified type of RCU
17412   * does not support RCU priority boosting, just do a direct call,
17413 @@ -3049,20 +3081,107 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
17414  {
17415         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
17416                 return;
17417 -       if (likely(!rsp->boost)) {
17418 -               rcu_do_batch(rsp, rdp);
17419 -               return;
17420 -       }
17421 -       invoke_rcu_callbacks_kthread();
17422 +       rcu_do_batch(rsp, rdp);
17423 +}
17424 +
17425 +static void rcu_wake_cond(struct task_struct *t, int status)
17426 +{
17427 +       /*
17428 +        * If the thread is yielding, only wake it when this
17429 +        * is invoked from idle
17430 +        */
17431 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
17432 +               wake_up_process(t);
17433  }
17434  
17435 +/*
17436 + * Wake up this CPU's rcuc kthread to do RCU core processing.
17437 + */
17438  static void invoke_rcu_core(void)
17439  {
17440 -       if (cpu_online(smp_processor_id()))
17441 -               raise_softirq(RCU_SOFTIRQ);
17442 +       unsigned long flags;
17443 +       struct task_struct *t;
17444 +
17445 +       if (!cpu_online(smp_processor_id()))
17446 +               return;
17447 +       local_irq_save(flags);
17448 +       __this_cpu_write(rcu_cpu_has_work, 1);
17449 +       t = __this_cpu_read(rcu_cpu_kthread_task);
17450 +       if (t != NULL && current != t)
17451 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
17452 +       local_irq_restore(flags);
17453 +}
17454 +
17455 +static void rcu_cpu_kthread_park(unsigned int cpu)
17456 +{
17457 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
17458 +}
17459 +
17460 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
17461 +{
17462 +       return __this_cpu_read(rcu_cpu_has_work);
17463  }
17464  
17465  /*
17466 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
17467 + * RCU softirq used in flavors and configurations of RCU that do not
17468 + * support RCU priority boosting.
17469 + */
17470 +static void rcu_cpu_kthread(unsigned int cpu)
17471 +{
17472 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
17473 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
17474 +       int spincnt;
17475 +
17476 +       for (spincnt = 0; spincnt < 10; spincnt++) {
17477 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
17478 +               local_bh_disable();
17479 +               *statusp = RCU_KTHREAD_RUNNING;
17480 +               this_cpu_inc(rcu_cpu_kthread_loops);
17481 +               local_irq_disable();
17482 +               work = *workp;
17483 +               *workp = 0;
17484 +               local_irq_enable();
17485 +               if (work)
17486 +                       rcu_process_callbacks();
17487 +               local_bh_enable();
17488 +               if (*workp == 0) {
17489 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
17490 +                       *statusp = RCU_KTHREAD_WAITING;
17491 +                       return;
17492 +               }
17493 +       }
17494 +       *statusp = RCU_KTHREAD_YIELDING;
17495 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
17496 +       schedule_timeout_interruptible(2);
17497 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
17498 +       *statusp = RCU_KTHREAD_WAITING;
17499 +}
17500 +
17501 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
17502 +       .store                  = &rcu_cpu_kthread_task,
17503 +       .thread_should_run      = rcu_cpu_kthread_should_run,
17504 +       .thread_fn              = rcu_cpu_kthread,
17505 +       .thread_comm            = "rcuc/%u",
17506 +       .setup                  = rcu_cpu_kthread_setup,
17507 +       .park                   = rcu_cpu_kthread_park,
17508 +};
17509 +
17510 +/*
17511 + * Spawn per-CPU RCU core processing kthreads.
17512 + */
17513 +static int __init rcu_spawn_core_kthreads(void)
17514 +{
17515 +       int cpu;
17516 +
17517 +       for_each_possible_cpu(cpu)
17518 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
17519 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
17520 +       return 0;
17521 +}
17522 +early_initcall(rcu_spawn_core_kthreads);
17523 +
17524 +/*
17525   * Handle any core-RCU processing required by a call_rcu() invocation.
17526   */
17527  static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
17528 @@ -3205,6 +3324,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
17529  }
17530  EXPORT_SYMBOL_GPL(call_rcu_sched);
17531  
17532 +#ifndef CONFIG_PREEMPT_RT_FULL
17533  /*
17534   * Queue an RCU callback for invocation after a quicker grace period.
17535   */
17536 @@ -3213,6 +3333,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
17537         __call_rcu(head, func, &rcu_bh_state, -1, 0);
17538  }
17539  EXPORT_SYMBOL_GPL(call_rcu_bh);
17540 +#endif
17541  
17542  /*
17543   * Queue an RCU callback for lazy invocation after a grace period.
17544 @@ -3304,6 +3425,7 @@ void synchronize_sched(void)
17545  }
17546  EXPORT_SYMBOL_GPL(synchronize_sched);
17547  
17548 +#ifndef CONFIG_PREEMPT_RT_FULL
17549  /**
17550   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
17551   *
17552 @@ -3330,6 +3452,7 @@ void synchronize_rcu_bh(void)
17553                 wait_rcu_gp(call_rcu_bh);
17554  }
17555  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
17556 +#endif
17557  
17558  /**
17559   * get_state_synchronize_rcu - Snapshot current RCU state
17560 @@ -3708,6 +3831,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
17561         mutex_unlock(&rsp->barrier_mutex);
17562  }
17563  
17564 +#ifndef CONFIG_PREEMPT_RT_FULL
17565  /**
17566   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
17567   */
17568 @@ -3716,6 +3840,7 @@ void rcu_barrier_bh(void)
17569         _rcu_barrier(&rcu_bh_state);
17570  }
17571  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
17572 +#endif
17573  
17574  /**
17575   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
17576 @@ -4237,12 +4362,13 @@ void __init rcu_init(void)
17577  
17578         rcu_bootup_announce();
17579         rcu_init_geometry();
17580 +#ifndef CONFIG_PREEMPT_RT_FULL
17581         rcu_init_one(&rcu_bh_state);
17582 +#endif
17583         rcu_init_one(&rcu_sched_state);
17584         if (dump_tree)
17585                 rcu_dump_rcu_node_tree(&rcu_sched_state);
17586         __rcu_init_preempt();
17587 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
17588  
17589         /*
17590          * We don't need protection against CPU-hotplug here because
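
The tree.c changes replace the RCU softirq with a per-CPU rcuc kthread: invoke_rcu_core() sets a per-CPU work flag and wakes the thread, and rcu_cpu_kthread() drains the flag for up to ten passes before yielding. Below is a pthread-based userspace model of that wake/drain pattern for a single CPU, with a condition variable standing in for the smpboot wakeup; it is not the kernel code.

/*
 * Userspace model of the rcuc kthread pattern introduced above: a work
 * flag set by invoke_rcu_core(), a worker that drains it with a bounded
 * number of passes before going back to sleep.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int rcu_cpu_has_work;
static bool stop;

static void invoke_rcu_core_model(void)
{
        pthread_mutex_lock(&lock);
        rcu_cpu_has_work = 1;
        pthread_cond_signal(&cond);     /* models rcu_wake_cond() */
        pthread_mutex_unlock(&lock);
}

static void rcu_process_callbacks_model(void)
{
        printf("processing RCU callbacks\n");
}

static void *rcu_cpu_kthread_model(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!stop) {
                while (!rcu_cpu_has_work && !stop)
                        pthread_cond_wait(&cond, &lock);        /* "should_run" */

                /* Drain for a bounded number of passes, then sleep again. */
                for (int spincnt = 0; spincnt < 10 && rcu_cpu_has_work; spincnt++) {
                        rcu_cpu_has_work = 0;
                        pthread_mutex_unlock(&lock);
                        rcu_process_callbacks_model();
                        pthread_mutex_lock(&lock);
                }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, rcu_cpu_kthread_model, NULL);
        invoke_rcu_core_model();
        usleep(100 * 1000);             /* let the worker run once */

        pthread_mutex_lock(&lock);
        stop = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(tid, NULL);
        return 0;
}
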
17591 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
17592 index e99a5234d9ed..958ac107062c 100644
17593 --- a/kernel/rcu/tree.h
17594 +++ b/kernel/rcu/tree.h
17595 @@ -588,18 +588,18 @@ extern struct list_head rcu_struct_flavors;
17596   */
17597  extern struct rcu_state rcu_sched_state;
17598  
17599 +#ifndef CONFIG_PREEMPT_RT_FULL
17600  extern struct rcu_state rcu_bh_state;
17601 +#endif
17602  
17603  #ifdef CONFIG_PREEMPT_RCU
17604  extern struct rcu_state rcu_preempt_state;
17605  #endif /* #ifdef CONFIG_PREEMPT_RCU */
17606  
17607 -#ifdef CONFIG_RCU_BOOST
17608  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17609  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
17610  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17611  DECLARE_PER_CPU(char, rcu_cpu_has_work);
17612 -#endif /* #ifdef CONFIG_RCU_BOOST */
17613  
17614  #ifndef RCU_TREE_NONCORE
17615  
17616 @@ -619,10 +619,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
17617  static void __init __rcu_init_preempt(void);
17618  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
17619  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
17620 -static void invoke_rcu_callbacks_kthread(void);
17621  static bool rcu_is_callbacks_kthread(void);
17622 +static void rcu_cpu_kthread_setup(unsigned int cpu);
17623  #ifdef CONFIG_RCU_BOOST
17624 -static void rcu_preempt_do_callbacks(void);
17625  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
17626                                                  struct rcu_node *rnp);
17627  #endif /* #ifdef CONFIG_RCU_BOOST */
17628 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
17629 index e3944c4b072d..be12d1aac840 100644
17630 --- a/kernel/rcu/tree_plugin.h
17631 +++ b/kernel/rcu/tree_plugin.h
17632 @@ -24,25 +24,10 @@
17633   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
17634   */
17635  
17636 -#include <linux/delay.h>
17637 -#include <linux/gfp.h>
17638 -#include <linux/oom.h>
17639 -#include <linux/smpboot.h>
17640 -#include "../time/tick-internal.h"
17641 -
17642  #ifdef CONFIG_RCU_BOOST
17643  
17644  #include "../locking/rtmutex_common.h"
17645  
17646 -/*
17647 - * Control variables for per-CPU and per-rcu_node kthreads.  These
17648 - * handle all flavors of RCU.
17649 - */
17650 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
17651 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17652 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17653 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
17654 -
17655  #else /* #ifdef CONFIG_RCU_BOOST */
17656  
17657  /*
17658 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
17659  
17660  #endif /* #else #ifdef CONFIG_RCU_BOOST */
17661  
17662 +/*
17663 + * Control variables for per-CPU and per-rcu_node kthreads.  These
17664 + * handle all flavors of RCU.
17665 + */
17666 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17667 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17668 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
17669 +
17670  #ifdef CONFIG_RCU_NOCB_CPU
17671  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
17672  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
17673 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
17674         }
17675  
17676         /* Hardware IRQ handlers cannot block, complain if they get here. */
17677 -       if (in_irq() || in_serving_softirq()) {
17678 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
17679                 lockdep_rcu_suspicious(__FILE__, __LINE__,
17680                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
17681                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
17682 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
17683                 t->rcu_read_unlock_special.b.need_qs = true;
17684  }
17685  
17686 -#ifdef CONFIG_RCU_BOOST
17687 -
17688 -static void rcu_preempt_do_callbacks(void)
17689 -{
17690 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
17691 -}
17692 -
17693 -#endif /* #ifdef CONFIG_RCU_BOOST */
17694 -
17695  /*
17696   * Queue a preemptible-RCU callback for invocation after a grace period.
17697   */
17698 @@ -829,6 +813,19 @@ void exit_rcu(void)
17699  
17700  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
17701  
17702 +/*
17703 + * If boosting, set rcuc kthreads to realtime priority.
17704 + */
17705 +static void rcu_cpu_kthread_setup(unsigned int cpu)
17706 +{
17707 +#ifdef CONFIG_RCU_BOOST
17708 +       struct sched_param sp;
17709 +
17710 +       sp.sched_priority = kthread_prio;
17711 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
17712 +#endif /* #ifdef CONFIG_RCU_BOOST */
17713 +}
17714 +
17715  #ifdef CONFIG_RCU_BOOST
17716  
17717  #include "../locking/rtmutex_common.h"
17718 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
17719  
17720  #endif /* #else #ifdef CONFIG_RCU_TRACE */
17721  
17722 -static void rcu_wake_cond(struct task_struct *t, int status)
17723 -{
17724 -       /*
17725 -        * If the thread is yielding, only wake it when this
17726 -        * is invoked from idle
17727 -        */
17728 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
17729 -               wake_up_process(t);
17730 -}
17731 -
17732  /*
17733   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
17734   * or ->boost_tasks, advancing the pointer to the next task in the
17735 @@ -1013,23 +1000,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
17736  }
17737  
17738  /*
17739 - * Wake up the per-CPU kthread to invoke RCU callbacks.
17740 - */
17741 -static void invoke_rcu_callbacks_kthread(void)
17742 -{
17743 -       unsigned long flags;
17744 -
17745 -       local_irq_save(flags);
17746 -       __this_cpu_write(rcu_cpu_has_work, 1);
17747 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
17748 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
17749 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
17750 -                             __this_cpu_read(rcu_cpu_kthread_status));
17751 -       }
17752 -       local_irq_restore(flags);
17753 -}
17754 -
17755 -/*
17756   * Is the current CPU running the RCU-callbacks kthread?
17757   * Caller must have preemption disabled.
17758   */
17759 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
17760         return 0;
17761  }
17762  
17763 -static void rcu_kthread_do_work(void)
17764 -{
17765 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
17766 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
17767 -       rcu_preempt_do_callbacks();
17768 -}
17769 -
17770 -static void rcu_cpu_kthread_setup(unsigned int cpu)
17771 -{
17772 -       struct sched_param sp;
17773 -
17774 -       sp.sched_priority = kthread_prio;
17775 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
17776 -}
17777 -
17778 -static void rcu_cpu_kthread_park(unsigned int cpu)
17779 -{
17780 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
17781 -}
17782 -
17783 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
17784 -{
17785 -       return __this_cpu_read(rcu_cpu_has_work);
17786 -}
17787 -
17788 -/*
17789 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
17790 - * RCU softirq used in flavors and configurations of RCU that do not
17791 - * support RCU priority boosting.
17792 - */
17793 -static void rcu_cpu_kthread(unsigned int cpu)
17794 -{
17795 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
17796 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
17797 -       int spincnt;
17798 -
17799 -       for (spincnt = 0; spincnt < 10; spincnt++) {
17800 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
17801 -               local_bh_disable();
17802 -               *statusp = RCU_KTHREAD_RUNNING;
17803 -               this_cpu_inc(rcu_cpu_kthread_loops);
17804 -               local_irq_disable();
17805 -               work = *workp;
17806 -               *workp = 0;
17807 -               local_irq_enable();
17808 -               if (work)
17809 -                       rcu_kthread_do_work();
17810 -               local_bh_enable();
17811 -               if (*workp == 0) {
17812 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
17813 -                       *statusp = RCU_KTHREAD_WAITING;
17814 -                       return;
17815 -               }
17816 -       }
17817 -       *statusp = RCU_KTHREAD_YIELDING;
17818 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
17819 -       schedule_timeout_interruptible(2);
17820 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
17821 -       *statusp = RCU_KTHREAD_WAITING;
17822 -}
17823 -
17824  /*
17825   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
17826   * served by the rcu_node in question.  The CPU hotplug lock is still
17827 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
17828         free_cpumask_var(cm);
17829  }
17830  
17831 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
17832 -       .store                  = &rcu_cpu_kthread_task,
17833 -       .thread_should_run      = rcu_cpu_kthread_should_run,
17834 -       .thread_fn              = rcu_cpu_kthread,
17835 -       .thread_comm            = "rcuc/%u",
17836 -       .setup                  = rcu_cpu_kthread_setup,
17837 -       .park                   = rcu_cpu_kthread_park,
17838 -};
17839 -
17840  /*
17841   * Spawn boost kthreads -- called as soon as the scheduler is running.
17842   */
17843  static void __init rcu_spawn_boost_kthreads(void)
17844  {
17845         struct rcu_node *rnp;
17846 -       int cpu;
17847 -
17848 -       for_each_possible_cpu(cpu)
17849 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
17850 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
17851         rcu_for_each_leaf_node(rcu_state_p, rnp)
17852                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
17853  }
17854 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
17855         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
17856  }
17857  
17858 -static void invoke_rcu_callbacks_kthread(void)
17859 -{
17860 -       WARN_ON_ONCE(1);
17861 -}
17862 -
17863  static bool rcu_is_callbacks_kthread(void)
17864  {
17865         return false;
17866 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
17867  
17868  #endif /* #else #ifdef CONFIG_RCU_BOOST */
17869  
17870 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
17871 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
17872  
17873  /*
17874   * Check to see if any future RCU-related work will need to be done
17875 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
17876         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
17877                ? 0 : rcu_cpu_has_callbacks(NULL);
17878  }
17879 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
17880  
17881 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
17882  /*
17883   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
17884   * after it.
17885 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
17886         return cbs_ready;
17887  }
17888  
17889 +#ifndef CONFIG_PREEMPT_RT_FULL
17890 +
17891  /*
17892   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
17893   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
17894 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
17895         *nextevt = basemono + dj * TICK_NSEC;
17896         return 0;
17897  }
17898 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
17899  
17900  /*
17901   * Prepare a CPU for idle from an RCU perspective.  The first major task
17902 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
17903 index 4f6db7e6a117..ee02e1e1b3e5 100644
17904 --- a/kernel/rcu/update.c
17905 +++ b/kernel/rcu/update.c
17906 @@ -62,7 +62,7 @@
17907  #ifndef CONFIG_TINY_RCU
17908  module_param(rcu_expedited, int, 0);
17909  module_param(rcu_normal, int, 0);
17910 -static int rcu_normal_after_boot;
17911 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17912  module_param(rcu_normal_after_boot, int, 0);
17913  #endif /* #ifndef CONFIG_TINY_RCU */
17914  
17915 @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void)
17916  }
17917  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
17918  
17919 -static atomic_t rcu_expedited_nesting =
17920 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
17921 +static atomic_t rcu_expedited_nesting =        ATOMIC_INIT(1);
17922  
17923  /*
17924   * Should normal grace-period primitives be expedited?  Intended for
17925 @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
17926   */
17927  void rcu_end_inkernel_boot(void)
17928  {
17929 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
17930 -               rcu_unexpedite_gp();
17931 +       rcu_unexpedite_gp();
17932         if (rcu_normal_after_boot)
17933                 WRITE_ONCE(rcu_normal, 1);
17934  }
17935 @@ -298,6 +296,7 @@ int rcu_read_lock_held(void)
17936  }
17937  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
17938  
17939 +#ifndef CONFIG_PREEMPT_RT_FULL
17940  /**
17941   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
17942   *
17943 @@ -324,6 +323,7 @@ int rcu_read_lock_bh_held(void)
17944         return in_softirq() || irqs_disabled();
17945  }
17946  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
17947 +#endif
17948  
17949  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
17950  
17951 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
17952 index 5e59b832ae2b..7337a7f60e3f 100644
17953 --- a/kernel/sched/Makefile
17954 +++ b/kernel/sched/Makefile
17955 @@ -17,7 +17,7 @@ endif
17956  
17957  obj-y += core.o loadavg.o clock.o cputime.o
17958  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
17959 -obj-y += wait.o swait.o completion.o idle.o
17960 +obj-y += wait.o swait.o swork.o completion.o idle.o
17961  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17962  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17963  obj-$(CONFIG_SCHEDSTATS) += stats.o
17964 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
17965 index 8d0f35debf35..b62cf6400fe0 100644
17966 --- a/kernel/sched/completion.c
17967 +++ b/kernel/sched/completion.c
17968 @@ -30,10 +30,10 @@ void complete(struct completion *x)
17969  {
17970         unsigned long flags;
17971  
17972 -       spin_lock_irqsave(&x->wait.lock, flags);
17973 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17974         x->done++;
17975 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
17976 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17977 +       swake_up_locked(&x->wait);
17978 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17979  }
17980  EXPORT_SYMBOL(complete);
17981  
17982 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
17983  {
17984         unsigned long flags;
17985  
17986 -       spin_lock_irqsave(&x->wait.lock, flags);
17987 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17988         x->done += UINT_MAX/2;
17989 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
17990 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17991 +       swake_up_all_locked(&x->wait);
17992 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17993  }
17994  EXPORT_SYMBOL(complete_all);
17995  
17996 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
17997                    long (*action)(long), long timeout, int state)
17998  {
17999         if (!x->done) {
18000 -               DECLARE_WAITQUEUE(wait, current);
18001 +               DECLARE_SWAITQUEUE(wait);
18002  
18003 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
18004 +               __prepare_to_swait(&x->wait, &wait);
18005                 do {
18006                         if (signal_pending_state(state, current)) {
18007                                 timeout = -ERESTARTSYS;
18008                                 break;
18009                         }
18010                         __set_current_state(state);
18011 -                       spin_unlock_irq(&x->wait.lock);
18012 +                       raw_spin_unlock_irq(&x->wait.lock);
18013                         timeout = action(timeout);
18014 -                       spin_lock_irq(&x->wait.lock);
18015 +                       raw_spin_lock_irq(&x->wait.lock);
18016                 } while (!x->done && timeout);
18017 -               __remove_wait_queue(&x->wait, &wait);
18018 +               __finish_swait(&x->wait, &wait);
18019                 if (!x->done)
18020                         return timeout;
18021         }
18022 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
18023  {
18024         might_sleep();
18025  
18026 -       spin_lock_irq(&x->wait.lock);
18027 +       raw_spin_lock_irq(&x->wait.lock);
18028         timeout = do_wait_for_common(x, action, timeout, state);
18029 -       spin_unlock_irq(&x->wait.lock);
18030 +       raw_spin_unlock_irq(&x->wait.lock);
18031         return timeout;
18032  }
18033  
18034 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
18035         if (!READ_ONCE(x->done))
18036                 return 0;
18037  
18038 -       spin_lock_irqsave(&x->wait.lock, flags);
18039 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
18040         if (!x->done)
18041                 ret = 0;
18042         else
18043                 x->done--;
18044 -       spin_unlock_irqrestore(&x->wait.lock, flags);
18045 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
18046         return ret;
18047  }
18048  EXPORT_SYMBOL(try_wait_for_completion);
18049 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
18050          * after it's acquired the lock.
18051          */
18052         smp_rmb();
18053 -       spin_unlock_wait(&x->wait.lock);
18054 +       raw_spin_unlock_wait(&x->wait.lock);
18055         return true;
18056  }
18057  EXPORT_SYMBOL(completion_done);
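
The completion.c hunk moves completions onto a simple-wait queue protected by a raw spinlock so complete() remains callable from truly atomic contexts on RT, while the done counter and the wait loop keep their existing semantics. The sketch below is a behavioural userspace model of those semantics, using a pthread mutex and condition variable in place of the raw lock and swait queue; it is not the kernel implementation.

/*
 * Userspace model of completion semantics as used above: a counter of
 * pending completions plus a wait loop that consumes one completion.
 */
#include <pthread.h>
#include <stdio.h>

struct completion_model {
        unsigned int done;
        pthread_mutex_t lock;
        pthread_cond_t wait;
};

static void complete_model(struct completion_model *x)
{
        pthread_mutex_lock(&x->lock);
        x->done++;                              /* complete(): done++ */
        pthread_cond_signal(&x->wait);          /* swake_up_locked() */
        pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion_model(struct completion_model *x)
{
        pthread_mutex_lock(&x->lock);
        while (!x->done)                        /* do_wait_for_common() loop */
                pthread_cond_wait(&x->wait, &x->lock);
        x->done--;                              /* consume one completion */
        pthread_mutex_unlock(&x->lock);
}

static struct completion_model done_flag = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .wait = PTHREAD_COND_INITIALIZER,
};

static void *worker(void *arg)
{
        (void)arg;
        printf("worker: signalling completion\n");
        complete_model(&done_flag);
        return NULL;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        wait_for_completion_model(&done_flag);
        printf("main: completion consumed\n");
        pthread_join(tid, NULL);
        return 0;
}
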
18058 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
18059 index e5066955cc3a..ed1ebcc2ff3d 100644
18060 --- a/kernel/sched/core.c
18061 +++ b/kernel/sched/core.c
18062 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
18063   * Number of tasks to iterate in a single balance run.
18064   * Limited because this is done with IRQs disabled.
18065   */
18066 +#ifndef CONFIG_PREEMPT_RT_FULL
18067  const_debug unsigned int sysctl_sched_nr_migrate = 32;
18068 +#else
18069 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
18070 +#endif
18071  
18072  /*
18073   * period over which we average the RT time consumption, measured
18074 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
18075  
18076         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
18077         rq->hrtick_timer.function = hrtick;
18078 +       rq->hrtick_timer.irqsafe = 1;
18079  }
18080  #else  /* CONFIG_SCHED_HRTICK */
18081  static inline void hrtick_clear(struct rq *rq)
18082 @@ -425,9 +430,15 @@ static bool set_nr_if_polling(struct task_struct *p)
18083  #endif
18084  #endif
18085  
18086 -void wake_q_add(struct wake_q_head *head, struct task_struct *task)
18087 +void __wake_q_add(struct wake_q_head *head, struct task_struct *task,
18088 +                 bool sleeper)
18089  {
18090 -       struct wake_q_node *node = &task->wake_q;
18091 +       struct wake_q_node *node;
18092 +
18093 +       if (sleeper)
18094 +               node = &task->wake_q_sleeper;
18095 +       else
18096 +               node = &task->wake_q;
18097  
18098         /*
18099          * Atomically grab the task, if ->wake_q is !nil already it means
18100 @@ -449,24 +460,33 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
18101         head->lastp = &node->next;
18102  }
18103  
18104 -void wake_up_q(struct wake_q_head *head)
18105 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
18106  {
18107         struct wake_q_node *node = head->first;
18108  
18109         while (node != WAKE_Q_TAIL) {
18110                 struct task_struct *task;
18111  
18112 -               task = container_of(node, struct task_struct, wake_q);
18113 +               if (sleeper)
18114 +                       task = container_of(node, struct task_struct, wake_q_sleeper);
18115 +               else
18116 +                       task = container_of(node, struct task_struct, wake_q);
18117                 BUG_ON(!task);
18118                 /* task can safely be re-inserted now */
18119                 node = node->next;
18120 -               task->wake_q.next = NULL;
18121 +               if (sleeper)
18122 +                       task->wake_q_sleeper.next = NULL;
18123 +               else
18124 +                       task->wake_q.next = NULL;
18125  
18126                 /*
18127                  * wake_up_process() implies a wmb() to pair with the queueing
18128                  * in wake_q_add() so as not to miss wakeups.
18129                  */
18130 -               wake_up_process(task);
18131 +               if (sleeper)
18132 +                       wake_up_lock_sleeper(task);
18133 +               else
18134 +                       wake_up_process(task);
18135                 put_task_struct(task);
18136         }
18137  }
18138 @@ -502,6 +522,38 @@ void resched_curr(struct rq *rq)
18139                 trace_sched_wake_idle_without_ipi(cpu);
18140  }
18141  
18142 +#ifdef CONFIG_PREEMPT_LAZY
18143 +void resched_curr_lazy(struct rq *rq)
18144 +{
18145 +       struct task_struct *curr = rq->curr;
18146 +       int cpu;
18147 +
18148 +       if (!sched_feat(PREEMPT_LAZY)) {
18149 +               resched_curr(rq);
18150 +               return;
18151 +       }
18152 +
18153 +       lockdep_assert_held(&rq->lock);
18154 +
18155 +       if (test_tsk_need_resched(curr))
18156 +               return;
18157 +
18158 +       if (test_tsk_need_resched_lazy(curr))
18159 +               return;
18160 +
18161 +       set_tsk_need_resched_lazy(curr);
18162 +
18163 +       cpu = cpu_of(rq);
18164 +       if (cpu == smp_processor_id())
18165 +               return;
18166 +
18167 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
18168 +       smp_mb();
18169 +       if (!tsk_is_polling(curr))
18170 +               smp_send_reschedule(cpu);
18171 +}
18172 +#endif
18173 +
18174  void resched_cpu(int cpu)
18175  {
18176         struct rq *rq = cpu_rq(cpu);
18177 @@ -524,11 +576,14 @@ void resched_cpu(int cpu)
18178   */
18179  int get_nohz_timer_target(void)
18180  {
18181 -       int i, cpu = smp_processor_id();
18182 +       int i, cpu;
18183         struct sched_domain *sd;
18184  
18185 +       preempt_disable_rt();
18186 +       cpu = smp_processor_id();
18187 +
18188         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
18189 -               return cpu;
18190 +               goto preempt_en_rt;
18191  
18192         rcu_read_lock();
18193         for_each_domain(cpu, sd) {
18194 @@ -547,6 +602,8 @@ int get_nohz_timer_target(void)
18195                 cpu = housekeeping_any_cpu();
18196  unlock:
18197         rcu_read_unlock();
18198 +preempt_en_rt:
18199 +       preempt_enable_rt();
18200         return cpu;
18201  }
18202  /*
18203 @@ -1092,7 +1149,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
18204         p->nr_cpus_allowed = cpumask_weight(new_mask);
18205  }
18206  
18207 -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
18208 +static void __do_set_cpus_allowed_tail(struct task_struct *p,
18209 +                                      const struct cpumask *new_mask)
18210  {
18211         struct rq *rq = task_rq(p);
18212         bool queued, running;
18213 @@ -1121,6 +1179,98 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
18214                 set_curr_task(rq, p);
18215  }
18216  
18217 +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
18218 +{
18219 +       if (__migrate_disabled(p)) {
18220 +               lockdep_assert_held(&p->pi_lock);
18221 +
18222 +               cpumask_copy(&p->cpus_allowed, new_mask);
18223 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
18224 +               p->migrate_disable_update = 1;
18225 +#endif
18226 +               return;
18227 +       }
18228 +       __do_set_cpus_allowed_tail(p, new_mask);
18229 +}
18230 +
18231 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
18232 +static DEFINE_MUTEX(sched_down_mutex);
18233 +static cpumask_t sched_down_cpumask;
18234 +
18235 +void tell_sched_cpu_down_begin(int cpu)
18236 +{
18237 +       mutex_lock(&sched_down_mutex);
18238 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
18239 +       mutex_unlock(&sched_down_mutex);
18240 +}
18241 +
18242 +void tell_sched_cpu_down_done(int cpu)
18243 +{
18244 +       mutex_lock(&sched_down_mutex);
18245 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
18246 +       mutex_unlock(&sched_down_mutex);
18247 +}
18248 +
18249 +/**
18250 + * migrate_me - try to move the current task off this cpu
18251 + *
18252 + * Used by the pin_current_cpu() code to try to get tasks
18253 + * to move off the current CPU as it is going down.
18254 + * It will only move the task if the task isn't pinned to
18255 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
18256 + * and the task has to be in a RUNNING state. Otherwise the
18257 + * movement of the task will wake it up (change its state
18258 + * to running) when the task did not expect it.
18259 + *
18260 + * Returns 1 if it succeeded in moving the current task
18261 + *         0 otherwise.
18262 + */
18263 +int migrate_me(void)
18264 +{
18265 +       struct task_struct *p = current;
18266 +       struct migration_arg arg;
18267 +       struct cpumask *cpumask;
18268 +       struct cpumask *mask;
18269 +       unsigned int dest_cpu;
18270 +       struct rq_flags rf;
18271 +       struct rq *rq;
18272 +
18273 +       /*
18274 +        * We cannot migrate tasks bound to a CPU or tasks that are
18275 +        * not running. Moving such a task would wake it up.
18276 +        */
18277 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
18278 +               return 0;
18279 +
18280 +       mutex_lock(&sched_down_mutex);
18281 +       rq = task_rq_lock(p, &rf);
18282 +
18283 +       cpumask = this_cpu_ptr(&sched_cpumasks);
18284 +       mask = &p->cpus_allowed;
18285 +
18286 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
18287 +
18288 +       if (!cpumask_weight(cpumask)) {
18289 +               /* It's only on this CPU? */
18290 +               task_rq_unlock(rq, p, &rf);
18291 +               mutex_unlock(&sched_down_mutex);
18292 +               return 0;
18293 +       }
18294 +
18295 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
18296 +
18297 +       arg.task = p;
18298 +       arg.dest_cpu = dest_cpu;
18299 +
18300 +       task_rq_unlock(rq, p, &rf);
18301 +
18302 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
18303 +       tlb_migrate_finish(p->mm);
18304 +       mutex_unlock(&sched_down_mutex);
18305 +
18306 +       return 1;
18307 +}
18308 +
18309  /*
18310   * Change a given task's CPU affinity. Migrate the thread to a
18311   * proper CPU and schedule it away if the CPU it's executing on
18312 @@ -1179,7 +1329,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
18313         }
18314  
18315         /* Can the task run on the task's current CPU? If so, we're done */
18316 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
18317 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
18318                 goto out;
18319  
18320         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
18321 @@ -1366,6 +1516,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
18322         return ret;
18323  }
18324  
18325 +static bool check_task_state(struct task_struct *p, long match_state)
18326 +{
18327 +       bool match = false;
18328 +
18329 +       raw_spin_lock_irq(&p->pi_lock);
18330 +       if (p->state == match_state || p->saved_state == match_state)
18331 +               match = true;
18332 +       raw_spin_unlock_irq(&p->pi_lock);
18333 +
18334 +       return match;
18335 +}
18336 +
18337  /*
18338   * wait_task_inactive - wait for a thread to unschedule.
18339   *
18340 @@ -1410,7 +1572,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
18341                  * is actually now running somewhere else!
18342                  */
18343                 while (task_running(rq, p)) {
18344 -                       if (match_state && unlikely(p->state != match_state))
18345 +                       if (match_state && !check_task_state(p, match_state))
18346                                 return 0;
18347                         cpu_relax();
18348                 }
18349 @@ -1425,7 +1587,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
18350                 running = task_running(rq, p);
18351                 queued = task_on_rq_queued(p);
18352                 ncsw = 0;
18353 -               if (!match_state || p->state == match_state)
18354 +               if (!match_state || p->state == match_state ||
18355 +                   p->saved_state == match_state)
18356                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
18357                 task_rq_unlock(rq, p, &rf);
18358  
18359 @@ -1680,10 +1843,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
18360  {
18361         activate_task(rq, p, en_flags);
18362         p->on_rq = TASK_ON_RQ_QUEUED;
18363 -
18364 -       /* if a worker is waking up, notify workqueue */
18365 -       if (p->flags & PF_WQ_WORKER)
18366 -               wq_worker_waking_up(p, cpu_of(rq));
18367  }
18368  
18369  /*
18370 @@ -2018,8 +2177,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
18371          */
18372         smp_mb__before_spinlock();
18373         raw_spin_lock_irqsave(&p->pi_lock, flags);
18374 -       if (!(p->state & state))
18375 +       if (!(p->state & state)) {
18376 +               /*
18377 +                * The task might be running due to a spinlock sleeper
18378 +                * wakeup. Check the saved state and set it to running
18379 +                * if the wakeup condition is true.
18380 +                */
18381 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
18382 +                       if (p->saved_state & state) {
18383 +                               p->saved_state = TASK_RUNNING;
18384 +                               success = 1;
18385 +                       }
18386 +               }
18387                 goto out;
18388 +       }
18389 +
18390 +       /*
18391 +        * If this is a regular wakeup, then we can unconditionally
18392 +        * clear the saved state of a "lock sleeper".
18393 +        */
18394 +       if (!(wake_flags & WF_LOCK_SLEEPER))
18395 +               p->saved_state = TASK_RUNNING;
18396  
18397         trace_sched_waking(p);
18398  
18399 @@ -2102,53 +2280,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
18400  }
18401  
18402  /**
18403 - * try_to_wake_up_local - try to wake up a local task with rq lock held
18404 - * @p: the thread to be awakened
18405 - * @cookie: context's cookie for pinning
18406 - *
18407 - * Put @p on the run-queue if it's not already there. The caller must
18408 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
18409 - * the current task.
18410 - */
18411 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
18412 -{
18413 -       struct rq *rq = task_rq(p);
18414 -
18415 -       if (WARN_ON_ONCE(rq != this_rq()) ||
18416 -           WARN_ON_ONCE(p == current))
18417 -               return;
18418 -
18419 -       lockdep_assert_held(&rq->lock);
18420 -
18421 -       if (!raw_spin_trylock(&p->pi_lock)) {
18422 -               /*
18423 -                * This is OK, because current is on_cpu, which avoids it being
18424 -                * picked for load-balance and preemption/IRQs are still
18425 -                * disabled avoiding further scheduler activity on it and we've
18426 -                * not yet picked a replacement task.
18427 -                */
18428 -               lockdep_unpin_lock(&rq->lock, cookie);
18429 -               raw_spin_unlock(&rq->lock);
18430 -               raw_spin_lock(&p->pi_lock);
18431 -               raw_spin_lock(&rq->lock);
18432 -               lockdep_repin_lock(&rq->lock, cookie);
18433 -       }
18434 -
18435 -       if (!(p->state & TASK_NORMAL))
18436 -               goto out;
18437 -
18438 -       trace_sched_waking(p);
18439 -
18440 -       if (!task_on_rq_queued(p))
18441 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
18442 -
18443 -       ttwu_do_wakeup(rq, p, 0, cookie);
18444 -       ttwu_stat(p, smp_processor_id(), 0);
18445 -out:
18446 -       raw_spin_unlock(&p->pi_lock);
18447 -}
18448 -
18449 -/**
18450   * wake_up_process - Wake up a specific process
18451   * @p: The process to be woken up.
18452   *
18453 @@ -2166,6 +2297,18 @@ int wake_up_process(struct task_struct *p)
18454  }
18455  EXPORT_SYMBOL(wake_up_process);
18456  
18457 +/**
18458 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
18459 + * @p: The process to be woken up.
18460 + *
18461 + * Same as wake_up_process() above, but with wake_flags=WF_LOCK_SLEEPER to indicate
18462 + * the nature of the wakeup.
18463 + */
18464 +int wake_up_lock_sleeper(struct task_struct *p)
18465 +{
18466 +       return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER);
18467 +}
18468 +
18469  int wake_up_state(struct task_struct *p, unsigned int state)
18470  {
18471         return try_to_wake_up(p, state, 0);
18472 @@ -2442,6 +2585,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
18473         p->on_cpu = 0;
18474  #endif
18475         init_task_preempt_count(p);
18476 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
18477 +       task_thread_info(p)->preempt_lazy_count = 0;
18478 +#endif
18479  #ifdef CONFIG_SMP
18480         plist_node_init(&p->pushable_tasks, MAX_PRIO);
18481         RB_CLEAR_NODE(&p->pushable_dl_tasks);
18482 @@ -2770,21 +2916,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
18483         finish_arch_post_lock_switch();
18484  
18485         fire_sched_in_preempt_notifiers(current);
18486 +       /*
18487 +        * We use mmdrop_delayed() here so we don't have to do the
18488 +        * full __mmdrop() when we are the last user.
18489 +        */
18490         if (mm)
18491 -               mmdrop(mm);
18492 +               mmdrop_delayed(mm);
18493         if (unlikely(prev_state == TASK_DEAD)) {
18494                 if (prev->sched_class->task_dead)
18495                         prev->sched_class->task_dead(prev);
18496  
18497 -               /*
18498 -                * Remove function-return probe instances associated with this
18499 -                * task and put them back on the free list.
18500 -                */
18501 -               kprobe_flush_task(prev);
18502 -
18503 -               /* Task is done with its stack. */
18504 -               put_task_stack(prev);
18505 -
18506                 put_task_struct(prev);
18507         }
18508  
18509 @@ -3252,6 +3393,114 @@ static inline void schedule_debug(struct task_struct *prev)
18510         schedstat_inc(this_rq()->sched_count);
18511  }
18512  
18513 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
18514 +
18515 +void migrate_disable(void)
18516 +{
18517 +       struct task_struct *p = current;
18518 +
18519 +       if (in_atomic() || irqs_disabled()) {
18520 +#ifdef CONFIG_SCHED_DEBUG
18521 +               p->migrate_disable_atomic++;
18522 +#endif
18523 +               return;
18524 +       }
18525 +
18526 +#ifdef CONFIG_SCHED_DEBUG
18527 +       if (unlikely(p->migrate_disable_atomic)) {
18528 +               tracing_off();
18529 +               WARN_ON_ONCE(1);
18530 +       }
18531 +#endif
18532 +
18533 +       if (p->migrate_disable) {
18534 +               p->migrate_disable++;
18535 +               return;
18536 +       }
18537 +
18538 +       preempt_disable();
18539 +       preempt_lazy_disable();
18540 +       pin_current_cpu();
18541 +       p->migrate_disable = 1;
18542 +       preempt_enable();
18543 +}
18544 +EXPORT_SYMBOL(migrate_disable);
18545 +
18546 +void migrate_enable(void)
18547 +{
18548 +       struct task_struct *p = current;
18549 +
18550 +       if (in_atomic() || irqs_disabled()) {
18551 +#ifdef CONFIG_SCHED_DEBUG
18552 +               p->migrate_disable_atomic--;
18553 +#endif
18554 +               return;
18555 +       }
18556 +
18557 +#ifdef CONFIG_SCHED_DEBUG
18558 +       if (unlikely(p->migrate_disable_atomic)) {
18559 +               tracing_off();
18560 +               WARN_ON_ONCE(1);
18561 +       }
18562 +#endif
18563 +       WARN_ON_ONCE(p->migrate_disable <= 0);
18564 +
18565 +       if (p->migrate_disable > 1) {
18566 +               p->migrate_disable--;
18567 +               return;
18568 +       }
18569 +
18570 +       preempt_disable();
18571 +       /*
18572 +        * Clearing migrate_disable causes tsk_cpus_allowed to
18573 +        * show the task's original cpu affinity.
18574 +        */
18575 +       p->migrate_disable = 0;
18576 +
18577 +       if (p->migrate_disable_update) {
18578 +               struct rq *rq;
18579 +               struct rq_flags rf;
18580 +
18581 +               rq = task_rq_lock(p, &rf);
18582 +               update_rq_clock(rq);
18583 +
18584 +               __do_set_cpus_allowed_tail(p, &p->cpus_allowed);
18585 +               task_rq_unlock(rq, p, &rf);
18586 +
18587 +               p->migrate_disable_update = 0;
18588 +
18589 +               WARN_ON(smp_processor_id() != task_cpu(p));
18590 +               if (!cpumask_test_cpu(task_cpu(p), &p->cpus_allowed)) {
18591 +                       const struct cpumask *cpu_valid_mask = cpu_active_mask;
18592 +                       struct migration_arg arg;
18593 +                       unsigned int dest_cpu;
18594 +
18595 +                       if (p->flags & PF_KTHREAD) {
18596 +                               /*
18597 +                                * Kernel threads are allowed on online && !active CPUs
18598 +                                */
18599 +                               cpu_valid_mask = cpu_online_mask;
18600 +                       }
18601 +                       dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_allowed);
18602 +                       arg.task = p;
18603 +                       arg.dest_cpu = dest_cpu;
18604 +
18605 +                       unpin_current_cpu();
18606 +                       preempt_lazy_enable();
18607 +                       preempt_enable();
18608 +                       stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
18609 +                       tlb_migrate_finish(p->mm);
18610 +                       return;
18611 +               }
18612 +       }
18613 +
18614 +       unpin_current_cpu();
18615 +       preempt_enable();
18616 +       preempt_lazy_enable();
18617 +}
18618 +EXPORT_SYMBOL(migrate_enable);
18619 +#endif
18620 +
18621  /*
18622   * Pick up the highest-prio task:
18623   */
18624 @@ -3368,19 +3617,6 @@ static void __sched notrace __schedule(bool preempt)
18625                 } else {
18626                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
18627                         prev->on_rq = 0;
18628 -
18629 -                       /*
18630 -                        * If a worker went to sleep, notify and ask workqueue
18631 -                        * whether it wants to wake up a task to maintain
18632 -                        * concurrency.
18633 -                        */
18634 -                       if (prev->flags & PF_WQ_WORKER) {
18635 -                               struct task_struct *to_wakeup;
18636 -
18637 -                               to_wakeup = wq_worker_sleeping(prev);
18638 -                               if (to_wakeup)
18639 -                                       try_to_wake_up_local(to_wakeup, cookie);
18640 -                       }
18641                 }
18642                 switch_count = &prev->nvcsw;
18643         }
18644 @@ -3390,6 +3626,7 @@ static void __sched notrace __schedule(bool preempt)
18645  
18646         next = pick_next_task(rq, prev, cookie);
18647         clear_tsk_need_resched(prev);
18648 +       clear_tsk_need_resched_lazy(prev);
18649         clear_preempt_need_resched();
18650         rq->clock_skip_update = 0;
18651  
18652 @@ -3437,9 +3674,20 @@ void __noreturn do_task_dead(void)
18653  
18654  static inline void sched_submit_work(struct task_struct *tsk)
18655  {
18656 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
18657 +       if (!tsk->state)
18658                 return;
18659         /*
18660 +        * If a worker went to sleep, notify and ask workqueue whether
18661 +        * it wants to wake up a task to maintain concurrency.
18662 +        */
18663 +       if (tsk->flags & PF_WQ_WORKER)
18664 +               wq_worker_sleeping(tsk);
18665 +
18666 +
18667 +       if (tsk_is_pi_blocked(tsk))
18668 +               return;
18669 +
18670 +       /*
18671          * If we are going to sleep and we have plugged IO queued,
18672          * make sure to submit it to avoid deadlocks.
18673          */
18674 @@ -3447,6 +3695,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
18675                 blk_schedule_flush_plug(tsk);
18676  }
18677  
18678 +static void sched_update_worker(struct task_struct *tsk)
18679 +{
18680 +       if (tsk->flags & PF_WQ_WORKER)
18681 +               wq_worker_running(tsk);
18682 +}
18683 +
18684  asmlinkage __visible void __sched schedule(void)
18685  {
18686         struct task_struct *tsk = current;
18687 @@ -3457,6 +3711,7 @@ asmlinkage __visible void __sched schedule(void)
18688                 __schedule(false);
18689                 sched_preempt_enable_no_resched();
18690         } while (need_resched());
18691 +       sched_update_worker(tsk);
18692  }
18693  EXPORT_SYMBOL(schedule);
18694  
18695 @@ -3520,6 +3775,30 @@ static void __sched notrace preempt_schedule_common(void)
18696         } while (need_resched());
18697  }
18698  
18699 +#ifdef CONFIG_PREEMPT_LAZY
18700 +/*
18701 + * If TIF_NEED_RESCHED is set, we allow being scheduled away, since it is
18702 + * set by an RT task. Otherwise we try to avoid being scheduled out as long
18703 + * as the preempt_lazy_count counter is > 0.
18704 + */
18705 +static __always_inline int preemptible_lazy(void)
18706 +{
18707 +       if (test_thread_flag(TIF_NEED_RESCHED))
18708 +               return 1;
18709 +       if (current_thread_info()->preempt_lazy_count)
18710 +               return 0;
18711 +       return 1;
18712 +}
18713 +
18714 +#else
18715 +
18716 +static inline int preemptible_lazy(void)
18717 +{
18718 +       return 1;
18719 +}
18720 +
18721 +#endif
18722 +
18723  #ifdef CONFIG_PREEMPT
18724  /*
18725   * this is the entry point to schedule() from in-kernel preemption
18726 @@ -3534,7 +3813,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
18727          */
18728         if (likely(!preemptible()))
18729                 return;
18730 -
18731 +       if (!preemptible_lazy())
18732 +               return;
18733         preempt_schedule_common();
18734  }
18735  NOKPROBE_SYMBOL(preempt_schedule);
18736 @@ -3561,6 +3841,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
18737         if (likely(!preemptible()))
18738                 return;
18739  
18740 +       if (!preemptible_lazy())
18741 +               return;
18742 +
18743         do {
18744                 /*
18745                  * Because the function tracer can trace preempt_count_sub()
18746 @@ -3583,7 +3866,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
18747                  * an infinite recursion.
18748                  */
18749                 prev_ctx = exception_enter();
18750 +               /*
18751 +                * The add/subtract must not be traced by the function
18752 +                * tracer. But we still want to account for the
18753 +                * preempt off latency tracer. Since the _notrace versions
18754 +                * of add/subtract skip the accounting for latency tracer
18755 +                * we must force it manually.
18756 +                */
18757 +               start_critical_timings();
18758                 __schedule(true);
18759 +               stop_critical_timings();
18760                 exception_exit(prev_ctx);
18761  
18762                 preempt_latency_stop(1);
18763 @@ -3629,10 +3921,25 @@ EXPORT_SYMBOL(default_wake_function);
18764  
18765  #ifdef CONFIG_RT_MUTEXES
18766  
18767 +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
18768 +{
18769 +       if (pi_task)
18770 +               prio = min(prio, pi_task->prio);
18771 +
18772 +       return prio;
18773 +}
18774 +
18775 +static inline int rt_effective_prio(struct task_struct *p, int prio)
18776 +{
18777 +       struct task_struct *pi_task = rt_mutex_get_top_task(p);
18778 +
18779 +       return __rt_effective_prio(pi_task, prio);
18780 +}
18781 +
18782  /*
18783   * rt_mutex_setprio - set the current priority of a task
18784 - * @p: task
18785 - * @prio: prio value (kernel-internal form)
18786 + * @p: task to boost
18787 + * @pi_task: donor task
18788   *
18789   * This function changes the 'effective' priority of a task. It does
18790   * not touch ->normal_prio like __setscheduler().
18791 @@ -3640,16 +3947,40 @@ EXPORT_SYMBOL(default_wake_function);
18792   * Used by the rt_mutex code to implement priority inheritance
18793   * logic. Call site only calls if the priority of the task changed.
18794   */
18795 -void rt_mutex_setprio(struct task_struct *p, int prio)
18796 +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
18797  {
18798 -       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
18799 +       int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
18800         const struct sched_class *prev_class;
18801         struct rq_flags rf;
18802         struct rq *rq;
18803  
18804 -       BUG_ON(prio > MAX_PRIO);
18805 +       /* XXX used to be waiter->prio, not waiter->task->prio */
18806 +       prio = __rt_effective_prio(pi_task, p->normal_prio);
18807 +
18808 +       /*
18809 +        * If nothing changed; bail early.
18810 +        */
18811 +       if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
18812 +               return;
18813  
18814         rq = __task_rq_lock(p, &rf);
18815 +       /*
18816 +        * Set under pi_lock && rq->lock, such that the value can be used under
18817 +        * either lock.
18818 +        *
18819 +        * Note that it takes loads of trickery to make this pointer cache work
18820 +        * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
18821 +        * ensure a task is de-boosted (pi_task is set to NULL) before the
18822 +        * task is allowed to run again (and can exit). This ensures the pointer
18823 +        * points to a blocked task -- which guarantees the task is present.
18824 +        */
18825 +       p->pi_top_task = pi_task;
18826 +
18827 +       /*
18828 +        * For FIFO/RR we only need to set prio, if that matches we're done.
18829 +        */
18830 +       if (prio == p->prio && !dl_prio(prio))
18831 +               goto out_unlock;
18832  
18833         /*
18834          * Idle task boosting is a nono in general. There is one
18835 @@ -3669,7 +4000,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18836                 goto out_unlock;
18837         }
18838  
18839 -       trace_sched_pi_setprio(p, prio);
18840 +       trace_sched_pi_setprio(p, pi_task);
18841         oldprio = p->prio;
18842  
18843         if (oldprio == prio)
18844 @@ -3693,7 +4024,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18845          *          running task
18846          */
18847         if (dl_prio(prio)) {
18848 -               struct task_struct *pi_task = rt_mutex_get_top_task(p);
18849                 if (!dl_prio(p->normal_prio) ||
18850                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
18851                         p->dl.dl_boosted = 1;
18852 @@ -3730,6 +4060,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18853         balance_callback(rq);
18854         preempt_enable();
18855  }
18856 +#else
18857 +static inline int rt_effective_prio(struct task_struct *p, int prio)
18858 +{
18859 +       return prio;
18860 +}
18861  #endif
18862  
18863  void set_user_nice(struct task_struct *p, long nice)
18864 @@ -3974,10 +4309,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
18865          * Keep a potential priority boosting if called from
18866          * sched_setscheduler().
18867          */
18868 +       p->prio = normal_prio(p);
18869         if (keep_boost)
18870 -               p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
18871 -       else
18872 -               p->prio = normal_prio(p);
18873 +               p->prio = rt_effective_prio(p, p->prio);
18874  
18875         if (dl_prio(p->prio))
18876                 p->sched_class = &dl_sched_class;
18877 @@ -4264,7 +4598,7 @@ static int __sched_setscheduler(struct task_struct *p,
18878                  * the runqueue. This will be done when the task deboost
18879                  * itself.
18880                  */
18881 -               new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
18882 +               new_effective_prio = rt_effective_prio(p, newprio);
18883                 if (new_effective_prio == oldprio)
18884                         queue_flags &= ~DEQUEUE_MOVE;
18885         }
18886 @@ -4939,6 +5273,7 @@ int __cond_resched_lock(spinlock_t *lock)
18887  }
18888  EXPORT_SYMBOL(__cond_resched_lock);
18889  
18890 +#ifndef CONFIG_PREEMPT_RT_FULL
18891  int __sched __cond_resched_softirq(void)
18892  {
18893         BUG_ON(!in_softirq());
18894 @@ -4952,6 +5287,7 @@ int __sched __cond_resched_softirq(void)
18895         return 0;
18896  }
18897  EXPORT_SYMBOL(__cond_resched_softirq);
18898 +#endif
18899  
18900  /**
18901   * yield - yield the current processor to other threads.
18902 @@ -5315,7 +5651,9 @@ void init_idle(struct task_struct *idle, int cpu)
18903  
18904         /* Set the preempt count _outside_ the spinlocks! */
18905         init_idle_preempt_count(idle, cpu);
18906 -
18907 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
18908 +       task_thread_info(idle)->preempt_lazy_count = 0;
18909 +#endif
18910         /*
18911          * The idle tasks have their own, simple scheduling class:
18912          */
18913 @@ -5458,6 +5796,8 @@ void sched_setnuma(struct task_struct *p, int nid)
18914  #endif /* CONFIG_NUMA_BALANCING */
18915  
18916  #ifdef CONFIG_HOTPLUG_CPU
18917 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
18918 +
18919  /*
18920   * Ensures that the idle task is using init_mm right before its cpu goes
18921   * offline.
18922 @@ -5472,7 +5812,12 @@ void idle_task_exit(void)
18923                 switch_mm(mm, &init_mm, current);
18924                 finish_arch_post_lock_switch();
18925         }
18926 -       mmdrop(mm);
18927 +       /*
18928 +        * Defer the cleanup to a live cpu. On RT we can neither
18929 +        * call mmdrop() nor mmdrop_delayed() from here.
18930 +        */
18931 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
18932 +
18933  }
18934  
18935  /*
18936 @@ -5881,6 +6226,7 @@ static int init_rootdomain(struct root_domain *rd)
18937         rd->rto_cpu = -1;
18938         raw_spin_lock_init(&rd->rto_lock);
18939         init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
18940 +       rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
18941  #endif
18942  
18943         init_dl_bw(&rd->dl_bw);
18944 @@ -7439,6 +7785,10 @@ int sched_cpu_dying(unsigned int cpu)
18945         update_max_interval();
18946         nohz_balance_exit_idle(cpu);
18947         hrtick_clear(rq);
18948 +       if (per_cpu(idle_last_mm, cpu)) {
18949 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
18950 +               per_cpu(idle_last_mm, cpu) = NULL;
18951 +       }
18952         return 0;
18953  }
18954  #endif
18955 @@ -7700,7 +8050,7 @@ void __init sched_init(void)
18956  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
18957  static inline int preempt_count_equals(int preempt_offset)
18958  {
18959 -       int nested = preempt_count() + rcu_preempt_depth();
18960 +       int nested = preempt_count() + sched_rcu_preempt_depth();
18961  
18962         return (nested == preempt_offset);
18963  }
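Among the core.c changes above, migrate_disable()/migrate_enable() (exported in the PREEMPT_RT_FULL+SMP block) let the current task pin itself to its CPU while staying preemptible, and do_set_cpus_allowed() defers affinity updates for such tasks until migrate_enable(). A minimal sketch of the pattern these exports serve; apart from migrate_disable()/migrate_enable() the names are illustrative:

        #include <linux/kernel.h>
        #include <linux/preempt.h>
        #include <linux/smp.h>

        static void run_pinned(void)
        {
                int cpu;

                migrate_disable();              /* pin to the current CPU, stay preemptible */
                cpu = raw_smp_processor_id();   /* stable until migrate_enable() */

                might_sleep();                  /* sleeping locks are fine, migration is not */
                pr_info("pinned on CPU %d\n", cpu);

                migrate_enable();               /* deferred affinity updates are applied here */
        }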
18964 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
18965 index c95c5122b105..e00accf92a4b 100644
18966 --- a/kernel/sched/deadline.c
18967 +++ b/kernel/sched/deadline.c
18968 @@ -687,6 +687,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
18969  
18970         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
18971         timer->function = dl_task_timer;
18972 +       timer->irqsafe = 1;
18973  }
18974  
18975  static
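The deadline.c hunk above, like the rt.c and hrtick hunks in this file set, marks an hrtimer as irqsafe. The irqsafe field is added to struct hrtimer by the hrtimer part of this patch series; it keeps the timer expiring from hard interrupt context on RT instead of being deferred to the softirq-based hrtimer thread. The pattern, sketched with an illustrative timer:

        #include <linux/hrtimer.h>
        #include <linux/ktime.h>

        static struct hrtimer poll_timer;         /* illustrative timer */

        static enum hrtimer_restart poll_fn(struct hrtimer *t)
        {
                /* keep it short and non-sleeping: this runs in hard IRQ context */
                return HRTIMER_NORESTART;
        }

        static void poll_start(void)
        {
                hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                poll_timer.function = poll_fn;
                poll_timer.irqsafe = 1;           /* RT-only field, as in the hunks above */
                hrtimer_start(&poll_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
        }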
18976 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
18977 index fa178b62ea79..935224123441 100644
18978 --- a/kernel/sched/debug.c
18979 +++ b/kernel/sched/debug.c
18980 @@ -558,6 +558,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
18981         P(rt_throttled);
18982         PN(rt_time);
18983         PN(rt_runtime);
18984 +#ifdef CONFIG_SMP
18985 +       P(rt_nr_migratory);
18986 +#endif
18987  
18988  #undef PN
18989  #undef P
18990 @@ -953,6 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
18991  #endif
18992         P(policy);
18993         P(prio);
18994 +#ifdef CONFIG_PREEMPT_RT_FULL
18995 +       P(migrate_disable);
18996 +#endif
18997 +       P(nr_cpus_allowed);
18998  #undef PN_SCHEDSTAT
18999  #undef PN
19000  #undef __PN
19001 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
19002 index 7a68c631d5b5..f9189909640b 100644
19003 --- a/kernel/sched/fair.c
19004 +++ b/kernel/sched/fair.c
19005 @@ -3518,7 +3518,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
19006         ideal_runtime = sched_slice(cfs_rq, curr);
19007         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
19008         if (delta_exec > ideal_runtime) {
19009 -               resched_curr(rq_of(cfs_rq));
19010 +               resched_curr_lazy(rq_of(cfs_rq));
19011                 /*
19012                  * The current task ran long enough, ensure it doesn't get
19013                  * re-elected due to buddy favours.
19014 @@ -3542,7 +3542,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
19015                 return;
19016  
19017         if (delta > ideal_runtime)
19018 -               resched_curr(rq_of(cfs_rq));
19019 +               resched_curr_lazy(rq_of(cfs_rq));
19020  }
19021  
19022  static void
19023 @@ -3684,7 +3684,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
19024          * validating it and just reschedule.
19025          */
19026         if (queued) {
19027 -               resched_curr(rq_of(cfs_rq));
19028 +               resched_curr_lazy(rq_of(cfs_rq));
19029                 return;
19030         }
19031         /*
19032 @@ -3866,7 +3866,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
19033          * hierarchy can be throttled
19034          */
19035         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
19036 -               resched_curr(rq_of(cfs_rq));
19037 +               resched_curr_lazy(rq_of(cfs_rq));
19038  }
19039  
19040  static __always_inline
19041 @@ -4494,7 +4494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
19042  
19043                 if (delta < 0) {
19044                         if (rq->curr == p)
19045 -                               resched_curr(rq);
19046 +                               resched_curr_lazy(rq);
19047                         return;
19048                 }
19049                 hrtick_start(rq, delta);
19050 @@ -5862,7 +5862,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
19051         return;
19052  
19053  preempt:
19054 -       resched_curr(rq);
19055 +       resched_curr_lazy(rq);
19056         /*
19057          * Only set the backward buddy when the current task is still
19058          * on the rq. This can happen when a wakeup gets interleaved
19059 @@ -8588,7 +8588,7 @@ static void task_fork_fair(struct task_struct *p)
19060                  * 'current' within the tree based on its new key value.
19061                  */
19062                 swap(curr->vruntime, se->vruntime);
19063 -               resched_curr(rq);
19064 +               resched_curr_lazy(rq);
19065         }
19066  
19067         se->vruntime -= cfs_rq->min_vruntime;
19068 @@ -8612,7 +8612,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
19069          */
19070         if (rq->curr == p) {
19071                 if (p->prio > oldprio)
19072 -                       resched_curr(rq);
19073 +                       resched_curr_lazy(rq);
19074         } else
19075                 check_preempt_curr(rq, p, 0);
19076  }
19077 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
19078 index 69631fa46c2f..6d28fcd08872 100644
19079 --- a/kernel/sched/features.h
19080 +++ b/kernel/sched/features.h
19081 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
19082   */
19083  SCHED_FEAT(NONTASK_CAPACITY, true)
19084  
19085 +#ifdef CONFIG_PREEMPT_RT_FULL
19086 +SCHED_FEAT(TTWU_QUEUE, false)
19087 +# ifdef CONFIG_PREEMPT_LAZY
19088 +SCHED_FEAT(PREEMPT_LAZY, true)
19089 +# endif
19090 +#else
19091 +
19092  /*
19093   * Queue remote wakeups on the target CPU and process them
19094   * using the scheduler IPI. Reduces rq->lock contention/bounces.
19095   */
19096  SCHED_FEAT(TTWU_QUEUE, true)
19097 +#endif
19098  
19099  #ifdef HAVE_RT_PUSH_IPI
19100  /*
19101 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
19102 index 9c131168d933..340a0a5d435c 100644
19103 --- a/kernel/sched/rt.c
19104 +++ b/kernel/sched/rt.c
19105 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
19106  
19107         hrtimer_init(&rt_b->rt_period_timer,
19108                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
19109 +       rt_b->rt_period_timer.irqsafe = 1;
19110         rt_b->rt_period_timer.function = sched_rt_period_timer;
19111  }
19112  
19113 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
19114 index cff985feb6e7..280c7d5a7657 100644
19115 --- a/kernel/sched/sched.h
19116 +++ b/kernel/sched/sched.h
19117 @@ -1162,6 +1162,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
19118  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
19119  #define WF_FORK                0x02            /* child wakeup after fork */
19120  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
19121 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
19122  
19123  /*
19124   * To aid in avoiding the subversion of "niceness" due to uneven distribution
19125 @@ -1345,6 +1346,15 @@ extern void init_sched_fair_class(void);
19126  extern void resched_curr(struct rq *rq);
19127  extern void resched_cpu(int cpu);
19128  
19129 +#ifdef CONFIG_PREEMPT_LAZY
19130 +extern void resched_curr_lazy(struct rq *rq);
19131 +#else
19132 +static inline void resched_curr_lazy(struct rq *rq)
19133 +{
19134 +       resched_curr(rq);
19135 +}
19136 +#endif
19137 +
19138  extern struct rt_bandwidth def_rt_bandwidth;
19139  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
19140  
19141 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
19142 index 82f0dff90030..ef027ff3250a 100644
19143 --- a/kernel/sched/swait.c
19144 +++ b/kernel/sched/swait.c
19145 @@ -1,5 +1,6 @@
19146  #include <linux/sched.h>
19147  #include <linux/swait.h>
19148 +#include <linux/suspend.h>
19149  
19150  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
19151                              struct lock_class_key *key)
19152 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
19153  }
19154  EXPORT_SYMBOL(swake_up_locked);
19155  
19156 +void swake_up_all_locked(struct swait_queue_head *q)
19157 +{
19158 +       struct swait_queue *curr;
19159 +       int wakes = 0;
19160 +
19161 +       while (!list_empty(&q->task_list)) {
19162 +
19163 +               curr = list_first_entry(&q->task_list, typeof(*curr),
19164 +                                       task_list);
19165 +               wake_up_process(curr->task);
19166 +               list_del_init(&curr->task_list);
19167 +               wakes++;
19168 +       }
19169 +       if (pm_in_action)
19170 +               return;
19171 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
19172 +}
19173 +EXPORT_SYMBOL(swake_up_all_locked);
19174 +
19175  void swake_up(struct swait_queue_head *q)
19176  {
19177         unsigned long flags;
19178 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
19179         if (!swait_active(q))
19180                 return;
19181  
19182 +       WARN_ON(irqs_disabled());
19183         raw_spin_lock_irq(&q->lock);
19184         list_splice_init(&q->task_list, &tmp);
19185         while (!list_empty(&tmp)) {
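The swait.c hunks add swake_up_all_locked() for the completion conversion above and warn when swake_up_all() runs with interrupts disabled. For reference, a small sketch of the simple-waitqueue pattern that kernel/sched/swork.c below also uses; the queue, flag and function names here are illustrative:

        #include <linux/swait.h>

        static DECLARE_SWAIT_QUEUE_HEAD(ready_wq);
        static bool data_ready;

        static void producer(void)
        {
                unsigned long flags;

                /*
                 * Update the condition and wake under the queue's raw lock,
                 * mirroring what complete_all() does above, so that no
                 * wakeup can be missed.
                 */
                raw_spin_lock_irqsave(&ready_wq.lock, flags);
                data_ready = true;
                swake_up_locked(&ready_wq);
                raw_spin_unlock_irqrestore(&ready_wq.lock, flags);
        }

        static void consumer(void)
        {
                /* sleeps until the condition is true; rechecked after each wakeup */
                swait_event_interruptible(ready_wq, data_ready);
        }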
19186 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
19187 new file mode 100644
19188 index 000000000000..1950f40ca725
19189 --- /dev/null
19190 +++ b/kernel/sched/swork.c
19191 @@ -0,0 +1,173 @@
19192 +/*
19193 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
19194 + *
19195 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks
19196 + * from irq context. The callbacks are executed in kthread context.
19197 + */
19198 +
19199 +#include <linux/swait.h>
19200 +#include <linux/swork.h>
19201 +#include <linux/kthread.h>
19202 +#include <linux/slab.h>
19203 +#include <linux/spinlock.h>
19204 +#include <linux/export.h>
19205 +
19206 +#define SWORK_EVENT_PENDING     (1 << 0)
19207 +
19208 +static DEFINE_MUTEX(worker_mutex);
19209 +static struct sworker *glob_worker;
19210 +
19211 +struct sworker {
19212 +       struct list_head events;
19213 +       struct swait_queue_head wq;
19214 +
19215 +       raw_spinlock_t lock;
19216 +
19217 +       struct task_struct *task;
19218 +       int refs;
19219 +};
19220 +
19221 +static bool swork_readable(struct sworker *worker)
19222 +{
19223 +       bool r;
19224 +
19225 +       if (kthread_should_stop())
19226 +               return true;
19227 +
19228 +       raw_spin_lock_irq(&worker->lock);
19229 +       r = !list_empty(&worker->events);
19230 +       raw_spin_unlock_irq(&worker->lock);
19231 +
19232 +       return r;
19233 +}
19234 +
19235 +static int swork_kthread(void *arg)
19236 +{
19237 +       struct sworker *worker = arg;
19238 +
19239 +       for (;;) {
19240 +               swait_event_interruptible(worker->wq,
19241 +                                       swork_readable(worker));
19242 +               if (kthread_should_stop())
19243 +                       break;
19244 +
19245 +               raw_spin_lock_irq(&worker->lock);
19246 +               while (!list_empty(&worker->events)) {
19247 +                       struct swork_event *sev;
19248 +
19249 +                       sev = list_first_entry(&worker->events,
19250 +                                       struct swork_event, item);
19251 +                       list_del(&sev->item);
19252 +                       raw_spin_unlock_irq(&worker->lock);
19253 +
19254 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
19255 +                                                        &sev->flags));
19256 +                       sev->func(sev);
19257 +                       raw_spin_lock_irq(&worker->lock);
19258 +               }
19259 +               raw_spin_unlock_irq(&worker->lock);
19260 +       }
19261 +       return 0;
19262 +}
19263 +
19264 +static struct sworker *swork_create(void)
19265 +{
19266 +       struct sworker *worker;
19267 +
19268 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
19269 +       if (!worker)
19270 +               return ERR_PTR(-ENOMEM);
19271 +
19272 +       INIT_LIST_HEAD(&worker->events);
19273 +       raw_spin_lock_init(&worker->lock);
19274 +       init_swait_queue_head(&worker->wq);
19275 +
19276 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
19277 +       if (IS_ERR(worker->task)) {
19278 +               kfree(worker);
19279 +               return ERR_PTR(-ENOMEM);
19280 +       }
19281 +
19282 +       return worker;
19283 +}
19284 +
19285 +static void swork_destroy(struct sworker *worker)
19286 +{
19287 +       kthread_stop(worker->task);
19288 +
19289 +       WARN_ON(!list_empty(&worker->events));
19290 +       kfree(worker);
19291 +}
19292 +
19293 +/**
19294 + * swork_queue - queue swork
19295 + *
19296 + * Returns %false if @work was already on a queue, %true otherwise.
19297 + *
19298 + * The work is queued and processed on a random CPU
19299 + */
19300 +bool swork_queue(struct swork_event *sev)
19301 +{
19302 +       unsigned long flags;
19303 +
19304 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
19305 +               return false;
19306 +
19307 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
19308 +       list_add_tail(&sev->item, &glob_worker->events);
19309 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
19310 +
19311 +       swake_up(&glob_worker->wq);
19312 +       return true;
19313 +}
19314 +EXPORT_SYMBOL_GPL(swork_queue);
19315 +
19316 +/**
19317 + * swork_get - get an instance of the sworker
19318 + *
19319 + * Returns a negative error code if the initialization of the worker
19320 + * failed, %0 otherwise.
19321 + *
19322 + */
19323 +int swork_get(void)
19324 +{
19325 +       struct sworker *worker;
19326 +
19327 +       mutex_lock(&worker_mutex);
19328 +       if (!glob_worker) {
19329 +               worker = swork_create();
19330 +               if (IS_ERR(worker)) {
19331 +                       mutex_unlock(&worker_mutex);
19332 +                       return -ENOMEM;
19333 +               }
19334 +
19335 +               glob_worker = worker;
19336 +       }
19337 +
19338 +       glob_worker->refs++;
19339 +       mutex_unlock(&worker_mutex);
19340 +
19341 +       return 0;
19342 +}
19343 +EXPORT_SYMBOL_GPL(swork_get);
19344 +
19345 +/**
19346 + * swork_put - puts an instance of the sworker
19347 + *
19348 + * Will destroy the sworker thread. This function must not be called until all
19349 + * queued events have been completed.
19350 + */
19351 +void swork_put(void)
19352 +{
19353 +       mutex_lock(&worker_mutex);
19354 +
19355 +       glob_worker->refs--;
19356 +       if (glob_worker->refs > 0)
19357 +               goto out;
19358 +
19359 +       swork_destroy(glob_worker);
19360 +       glob_worker = NULL;
19361 +out:
19362 +       mutex_unlock(&worker_mutex);
19363 +}
19364 +EXPORT_SYMBOL_GPL(swork_put);
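kernel/sched/swork.c above is new; its public surface is swork_get(), swork_queue() and swork_put(), intended for queueing callbacks from hard interrupt context that then run in the "kswork" kthread. A hedged sketch of a user: the swork_event field names (func, item, flags) are taken from their use in swork_kthread()/swork_queue() above, the struct itself and any init helper live in include/linux/swork.h elsewhere in this patch, and the driver names are invented:

        #include <linux/interrupt.h>
        #include <linux/swork.h>

        static void my_deferred_fn(struct swork_event *sev)
        {
                /* runs in the kswork kthread, so it may sleep and take mutexes */
        }

        static struct swork_event my_event = {
                .func = my_deferred_fn,
        };

        static int my_driver_init(void)
        {
                return swork_get();             /* create or refcount the global worker */
        }

        static irqreturn_t my_irq_handler(int irq, void *dev)
        {
                swork_queue(&my_event);         /* mark pending and wake kswork; IRQ-safe */
                return IRQ_HANDLED;
        }

        static void my_driver_exit(void)
        {
                swork_put();                    /* drop the worker reference */
        }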
19365 diff --git a/kernel/signal.c b/kernel/signal.c
19366 index e48668c3c972..99918dcd836f 100644
19367 --- a/kernel/signal.c
19368 +++ b/kernel/signal.c
19369 @@ -14,6 +14,7 @@
19370  #include <linux/export.h>
19371  #include <linux/init.h>
19372  #include <linux/sched.h>
19373 +#include <linux/sched/rt.h>
19374  #include <linux/fs.h>
19375  #include <linux/tty.h>
19376  #include <linux/binfmts.h>
19377 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
19378         return false;
19379  }
19380  
19381 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
19382 +{
19383 +       struct sigqueue *q = t->sigqueue_cache;
19384 +
19385 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
19386 +               return NULL;
19387 +       return q;
19388 +}
19389 +
19390 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
19391 +{
19392 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
19393 +               return 0;
19394 +       return 1;
19395 +}
19396 +
19397  /*
19398   * allocate a new signal queue record
19399   * - this may be called without locks if and only if t == current, otherwise an
19400   *   appropriate lock must be held to stop the target task from exiting
19401   */
19402  static struct sigqueue *
19403 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
19404 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
19405 +                   int override_rlimit, int fromslab)
19406  {
19407         struct sigqueue *q = NULL;
19408         struct user_struct *user;
19409 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
19410         if (override_rlimit ||
19411             atomic_read(&user->sigpending) <=
19412                         task_rlimit(t, RLIMIT_SIGPENDING)) {
19413 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
19414 +               if (!fromslab)
19415 +                       q = get_task_cache(t);
19416 +               if (!q)
19417 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
19418         } else {
19419                 print_dropped_signal(sig);
19420         }
19421 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
19422         return q;
19423  }
19424  
19425 +static struct sigqueue *
19426 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
19427 +                int override_rlimit)
19428 +{
19429 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
19430 +}
19431 +
19432  static void __sigqueue_free(struct sigqueue *q)
19433  {
19434         if (q->flags & SIGQUEUE_PREALLOC)
19435 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
19436         kmem_cache_free(sigqueue_cachep, q);
19437  }
19438  
19439 +static void sigqueue_free_current(struct sigqueue *q)
19440 +{
19441 +       struct user_struct *up;
19442 +
19443 +       if (q->flags & SIGQUEUE_PREALLOC)
19444 +               return;
19445 +
19446 +       up = q->user;
19447 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
19448 +               atomic_dec(&up->sigpending);
19449 +               free_uid(up);
19450 +       } else
19451 +                 __sigqueue_free(q);
19452 +}
19453 +
19454  void flush_sigqueue(struct sigpending *queue)
19455  {
19456         struct sigqueue *q;
19457 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
19458  }
19459  
19460  /*
19461 + * Called from __exit_signal. Flush tsk->pending and
19462 + * tsk->sigqueue_cache
19463 + */
19464 +void flush_task_sigqueue(struct task_struct *tsk)
19465 +{
19466 +       struct sigqueue *q;
19467 +
19468 +       flush_sigqueue(&tsk->pending);
19469 +
19470 +       q = get_task_cache(tsk);
19471 +       if (q)
19472 +               kmem_cache_free(sigqueue_cachep, q);
19473 +}
19474 +
19475 +/*
19476   * Flush all pending signals for this kthread.
19477   */
19478  void flush_signals(struct task_struct *t)
19479 @@ -532,7 +590,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
19480                         (info->si_code == SI_TIMER) &&
19481                         (info->si_sys_private);
19482  
19483 -               __sigqueue_free(first);
19484 +               sigqueue_free_current(first);
19485         } else {
19486                 /*
19487                  * Ok, it wasn't in the queue.  This must be
19488 @@ -568,6 +626,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
19489         bool resched_timer = false;
19490         int signr;
19491  
19492 +       WARN_ON_ONCE(tsk != current);
19493 +
19494         /* We only dequeue private signals from ourselves, we don't let
19495          * signalfd steal them
19496          */
19497 @@ -1164,8 +1224,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
19498   * We don't want to have recursive SIGSEGV's etc, for example,
19499   * that is why we also clear SIGNAL_UNKILLABLE.
19500   */
19501 -int
19502 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19503 +static int
19504 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19505  {
19506         unsigned long int flags;
19507         int ret, blocked, ignored;
19508 @@ -1190,6 +1250,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19509         return ret;
19510  }
19511  
19512 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
19513 +{
19514 +/*
19515 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
19516 + * since it cannot enable preemption, and the signal code's spin_locks
19517 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
19518 + * send the signal on exit of the trap.
19519 + */
19520 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
19521 +       if (in_atomic()) {
19522 +               if (WARN_ON_ONCE(t != current))
19523 +                       return 0;
19524 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
19525 +                       return 0;
19526 +
19527 +               if (is_si_special(info)) {
19528 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
19529 +                       t->forced_info.si_signo = sig;
19530 +                       t->forced_info.si_errno = 0;
19531 +                       t->forced_info.si_code = SI_KERNEL;
19532 +                       t->forced_info.si_pid = 0;
19533 +                       t->forced_info.si_uid = 0;
19534 +               } else {
19535 +                       t->forced_info = *info;
19536 +               }
19537 +
19538 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
19539 +               return 0;
19540 +       }
19541 +#endif
19542 +       return do_force_sig_info(sig, info, t);
19543 +}
19544 +
19545  /*
19546   * Nuke all other threads in the group.
19547   */
19548 @@ -1224,12 +1317,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
19549                  * Disable interrupts early to avoid deadlocks.
19550                  * See rcu_read_unlock() comment header for details.
19551                  */
19552 -               local_irq_save(*flags);
19553 +               local_irq_save_nort(*flags);
19554                 rcu_read_lock();
19555                 sighand = rcu_dereference(tsk->sighand);
19556                 if (unlikely(sighand == NULL)) {
19557                         rcu_read_unlock();
19558 -                       local_irq_restore(*flags);
19559 +                       local_irq_restore_nort(*flags);
19560                         break;
19561                 }
19562                 /*
19563 @@ -1250,7 +1343,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
19564                 }
19565                 spin_unlock(&sighand->siglock);
19566                 rcu_read_unlock();
19567 -               local_irq_restore(*flags);
19568 +               local_irq_restore_nort(*flags);
19569         }
19570  
19571         return sighand;
19572 @@ -1493,7 +1586,8 @@ EXPORT_SYMBOL(kill_pid);
19573   */
19574  struct sigqueue *sigqueue_alloc(void)
19575  {
19576 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
19577 +       /* Preallocated sigqueue objects always come from the slabcache! */
19578 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
19579  
19580         if (q)
19581                 q->flags |= SIGQUEUE_PREALLOC;
19582 @@ -1854,15 +1948,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
19583                 if (gstop_done && ptrace_reparented(current))
19584                         do_notify_parent_cldstop(current, false, why);
19585  
19586 -               /*
19587 -                * Don't want to allow preemption here, because
19588 -                * sys_ptrace() needs this task to be inactive.
19589 -                *
19590 -                * XXX: implement read_unlock_no_resched().
19591 -                */
19592 -               preempt_disable();
19593                 read_unlock(&tasklist_lock);
19594 -               preempt_enable_no_resched();
19595                 freezable_schedule();
19596         } else {
19597                 /*
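The force_sig_info() split above only covers the producer side: when a trap fires in atomic context on an ARCH_RT_DELAYS_SIGNAL_SEND architecture, the siginfo is parked in task_struct::forced_info and TIF_NOTIFY_RESUME is set. The consumer lives in per-architecture return-to-user code that is not part of this section; the fragment below is only an illustration of that pattern, assuming a do_notify_resume()-style exit hook, and is not a hunk of this patch.

/*
 * Illustration only (not a hunk of this patch): the arch side that
 * consumes the deferred signal on return to user space. Only
 * task_struct::forced_info and force_sig_info() come from the hunks
 * above; the surrounding hook is assumed to be the architecture's
 * do_notify_resume()-style exit path.
 */
#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
	if (unlikely(current->forced_info.si_signo)) {
		struct task_struct *t = current;

		/* We are preemptible again, so taking the sleeping siglock is fine. */
		force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
		t->forced_info.si_signo = 0;
	}
#endif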
19598 diff --git a/kernel/softirq.c b/kernel/softirq.c
19599 index 744fa611cae0..819bd7cf5ad0 100644
19600 --- a/kernel/softirq.c
19601 +++ b/kernel/softirq.c
19602 @@ -21,10 +21,12 @@
19603  #include <linux/freezer.h>
19604  #include <linux/kthread.h>
19605  #include <linux/rcupdate.h>
19606 +#include <linux/delay.h>
19607  #include <linux/ftrace.h>
19608  #include <linux/smp.h>
19609  #include <linux/smpboot.h>
19610  #include <linux/tick.h>
19611 +#include <linux/locallock.h>
19612  #include <linux/irq.h>
19613  
19614  #define CREATE_TRACE_POINTS
19615 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
19616  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
19617  
19618  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
19619 +#ifdef CONFIG_PREEMPT_RT_FULL
19620 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
19621 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
19622 +#endif
19623  
19624  const char * const softirq_to_name[NR_SOFTIRQS] = {
19625         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
19626         "TASKLET", "SCHED", "HRTIMER", "RCU"
19627  };
19628  
19629 +#ifdef CONFIG_NO_HZ_COMMON
19630 +# ifdef CONFIG_PREEMPT_RT_FULL
19631 +
19632 +struct softirq_runner {
19633 +       struct task_struct *runner[NR_SOFTIRQS];
19634 +};
19635 +
19636 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
19637 +
19638 +static inline void softirq_set_runner(unsigned int sirq)
19639 +{
19640 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
19641 +
19642 +       sr->runner[sirq] = current;
19643 +}
19644 +
19645 +static inline void softirq_clr_runner(unsigned int sirq)
19646 +{
19647 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
19648 +
19649 +       sr->runner[sirq] = NULL;
19650 +}
19651 +
19652 +/*
19653 + * On preempt-rt a softirq running context might be blocked on a
19654 + * lock. There might be no other runnable task on this CPU because the
19655 + * lock owner runs on some other CPU. So we have to go into idle with
19656 + * the pending bit set. Therefore we need to check this, otherwise we
19657 + * warn about false positives, which confuses users and defeats the
19658 + * whole purpose of this test.
19659 + *
19660 + * This code is called with interrupts disabled.
19661 + */
19662 +void softirq_check_pending_idle(void)
19663 +{
19664 +       static int rate_limit;
19665 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
19666 +       u32 warnpending;
19667 +       int i;
19668 +
19669 +       if (rate_limit >= 10)
19670 +               return;
19671 +
19672 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
19673 +       for (i = 0; i < NR_SOFTIRQS; i++) {
19674 +               struct task_struct *tsk = sr->runner[i];
19675 +
19676 +               /*
19677 +                * The wakeup code in rtmutex.c wakes up the task
19678 +                * _before_ it sets pi_blocked_on to NULL under
19679 +                * tsk->pi_lock. So we need to check for both: state
19680 +                * and pi_blocked_on.
19681 +                */
19682 +               if (tsk) {
19683 +                       raw_spin_lock(&tsk->pi_lock);
19684 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
19685 +                               /* Clear all bits pending in that task */
19686 +                               warnpending &= ~(tsk->softirqs_raised);
19687 +                               warnpending &= ~(1 << i);
19688 +                       }
19689 +                       raw_spin_unlock(&tsk->pi_lock);
19690 +               }
19691 +       }
19692 +
19693 +       if (warnpending) {
19694 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
19695 +                      warnpending);
19696 +               rate_limit++;
19697 +       }
19698 +}
19699 +# else
19700 +/*
19701 + * On !PREEMPT_RT we just printk rate limited:
19702 + */
19703 +void softirq_check_pending_idle(void)
19704 +{
19705 +       static int rate_limit;
19706 +
19707 +       if (rate_limit < 10 &&
19708 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
19709 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
19710 +                      local_softirq_pending());
19711 +               rate_limit++;
19712 +       }
19713 +}
19714 +# endif
19715 +
19716 +#else /* !CONFIG_NO_HZ_COMMON */
19717 +static inline void softirq_set_runner(unsigned int sirq) { }
19718 +static inline void softirq_clr_runner(unsigned int sirq) { }
19719 +#endif
19720 +
19721  /*
19722   * we cannot loop indefinitely here to avoid userspace starvation,
19723   * but we also don't want to introduce a worst case 1/HZ latency
19724 @@ -77,6 +175,38 @@ static void wakeup_softirqd(void)
19725                 wake_up_process(tsk);
19726  }
19727  
19728 +#ifdef CONFIG_PREEMPT_RT_FULL
19729 +static void wakeup_timer_softirqd(void)
19730 +{
19731 +       /* Interrupts are disabled: no need to stop preemption */
19732 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
19733 +
19734 +       if (tsk && tsk->state != TASK_RUNNING)
19735 +               wake_up_process(tsk);
19736 +}
19737 +#endif
19738 +
19739 +static void handle_softirq(unsigned int vec_nr)
19740 +{
19741 +       struct softirq_action *h = softirq_vec + vec_nr;
19742 +       int prev_count;
19743 +
19744 +       prev_count = preempt_count();
19745 +
19746 +       kstat_incr_softirqs_this_cpu(vec_nr);
19747 +
19748 +       trace_softirq_entry(vec_nr);
19749 +       h->action(h);
19750 +       trace_softirq_exit(vec_nr);
19751 +       if (unlikely(prev_count != preempt_count())) {
19752 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
19753 +                      vec_nr, softirq_to_name[vec_nr], h->action,
19754 +                      prev_count, preempt_count());
19755 +               preempt_count_set(prev_count);
19756 +       }
19757 +}
19758 +
19759 +#ifndef CONFIG_PREEMPT_RT_FULL
19760  /*
19761   * If ksoftirqd is scheduled, we do not want to process pending softirqs
19762   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
19763 @@ -88,6 +218,47 @@ static bool ksoftirqd_running(void)
19764         return tsk && (tsk->state == TASK_RUNNING);
19765  }
19766  
19767 +static inline int ksoftirqd_softirq_pending(void)
19768 +{
19769 +       return local_softirq_pending();
19770 +}
19771 +
19772 +static void handle_pending_softirqs(u32 pending)
19773 +{
19774 +       struct softirq_action *h = softirq_vec;
19775 +       int softirq_bit;
19776 +
19777 +       local_irq_enable();
19778 +
19779 +       h = softirq_vec;
19780 +
19781 +       while ((softirq_bit = ffs(pending))) {
19782 +               unsigned int vec_nr;
19783 +
19784 +               h += softirq_bit - 1;
19785 +               vec_nr = h - softirq_vec;
19786 +               handle_softirq(vec_nr);
19787 +
19788 +               h++;
19789 +               pending >>= softirq_bit;
19790 +       }
19791 +
19792 +       rcu_bh_qs();
19793 +       local_irq_disable();
19794 +}
19795 +
19796 +static void run_ksoftirqd(unsigned int cpu)
19797 +{
19798 +       local_irq_disable();
19799 +       if (ksoftirqd_softirq_pending()) {
19800 +               __do_softirq();
19801 +               local_irq_enable();
19802 +               cond_resched_rcu_qs();
19803 +               return;
19804 +       }
19805 +       local_irq_enable();
19806 +}
19807 +
19808  /*
19809   * preempt_count and SOFTIRQ_OFFSET usage:
19810   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
19811 @@ -243,10 +414,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
19812         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
19813         unsigned long old_flags = current->flags;
19814         int max_restart = MAX_SOFTIRQ_RESTART;
19815 -       struct softirq_action *h;
19816         bool in_hardirq;
19817         __u32 pending;
19818 -       int softirq_bit;
19819  
19820         /*
19821          * Mask out PF_MEMALLOC s current task context is borrowed for the
19822 @@ -265,36 +434,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
19823         /* Reset the pending bitmask before enabling irqs */
19824         set_softirq_pending(0);
19825  
19826 -       local_irq_enable();
19827 -
19828 -       h = softirq_vec;
19829 -
19830 -       while ((softirq_bit = ffs(pending))) {
19831 -               unsigned int vec_nr;
19832 -               int prev_count;
19833 -
19834 -               h += softirq_bit - 1;
19835 -
19836 -               vec_nr = h - softirq_vec;
19837 -               prev_count = preempt_count();
19838 -
19839 -               kstat_incr_softirqs_this_cpu(vec_nr);
19840 -
19841 -               trace_softirq_entry(vec_nr);
19842 -               h->action(h);
19843 -               trace_softirq_exit(vec_nr);
19844 -               if (unlikely(prev_count != preempt_count())) {
19845 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
19846 -                              vec_nr, softirq_to_name[vec_nr], h->action,
19847 -                              prev_count, preempt_count());
19848 -                       preempt_count_set(prev_count);
19849 -               }
19850 -               h++;
19851 -               pending >>= softirq_bit;
19852 -       }
19853 -
19854 -       rcu_bh_qs();
19855 -       local_irq_disable();
19856 +       handle_pending_softirqs(pending);
19857  
19858         pending = local_softirq_pending();
19859         if (pending) {
19860 @@ -331,6 +471,309 @@ asmlinkage __visible void do_softirq(void)
19861  }
19862  
19863  /*
19864 + * This function must run with irqs disabled!
19865 + */
19866 +void raise_softirq_irqoff(unsigned int nr)
19867 +{
19868 +       __raise_softirq_irqoff(nr);
19869 +
19870 +       /*
19871 +        * If we're in an interrupt or softirq, we're done
19872 +        * (this also catches softirq-disabled code). We will
19873 +        * actually run the softirq once we return from
19874 +        * the irq or softirq.
19875 +        *
19876 +        * Otherwise we wake up ksoftirqd to make sure we
19877 +        * schedule the softirq soon.
19878 +        */
19879 +       if (!in_interrupt())
19880 +               wakeup_softirqd();
19881 +}
19882 +
19883 +void __raise_softirq_irqoff(unsigned int nr)
19884 +{
19885 +       trace_softirq_raise(nr);
19886 +       or_softirq_pending(1UL << nr);
19887 +}
19888 +
19889 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
19890 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
19891 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
19892 +
19893 +#else /* !PREEMPT_RT_FULL */
19894 +
19895 +/*
19896 + * On RT we serialize softirq execution with a CPU-local lock per softirq
19897 + */
19898 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
19899 +
19900 +void __init softirq_early_init(void)
19901 +{
19902 +       int i;
19903 +
19904 +       for (i = 0; i < NR_SOFTIRQS; i++)
19905 +               local_irq_lock_init(local_softirq_locks[i]);
19906 +}
19907 +
19908 +static void lock_softirq(int which)
19909 +{
19910 +       local_lock(local_softirq_locks[which]);
19911 +}
19912 +
19913 +static void unlock_softirq(int which)
19914 +{
19915 +       local_unlock(local_softirq_locks[which]);
19916 +}
19917 +
19918 +static void do_single_softirq(int which)
19919 +{
19920 +       unsigned long old_flags = current->flags;
19921 +
19922 +       current->flags &= ~PF_MEMALLOC;
19923 +       vtime_account_irq_enter(current);
19924 +       current->flags |= PF_IN_SOFTIRQ;
19925 +       lockdep_softirq_enter();
19926 +       local_irq_enable();
19927 +       handle_softirq(which);
19928 +       local_irq_disable();
19929 +       lockdep_softirq_exit();
19930 +       current->flags &= ~PF_IN_SOFTIRQ;
19931 +       vtime_account_irq_enter(current);
19932 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
19933 +}
19934 +
19935 +/*
19936 + * Called with interrupts disabled. Process softirqs which were raised
19937 + * in current context (or on behalf of ksoftirqd).
19938 + */
19939 +static void do_current_softirqs(void)
19940 +{
19941 +       while (current->softirqs_raised) {
19942 +               int i = __ffs(current->softirqs_raised);
19943 +               unsigned int pending, mask = (1U << i);
19944 +
19945 +               current->softirqs_raised &= ~mask;
19946 +               local_irq_enable();
19947 +
19948 +               /*
19949 +                * If the lock is contended, we boost the owner to
19950 +                * process the softirq or leave the critical section
19951 +                * now.
19952 +                */
19953 +               lock_softirq(i);
19954 +               local_irq_disable();
19955 +               softirq_set_runner(i);
19956 +               /*
19957 +                * Check the local_softirq_pending() bits to see
19958 +                * whether we still need to process this or if someone
19959 +                * else took care of it.
19960 +                */
19961 +               pending = local_softirq_pending();
19962 +               if (pending & mask) {
19963 +                       set_softirq_pending(pending & ~mask);
19964 +                       do_single_softirq(i);
19965 +               }
19966 +               softirq_clr_runner(i);
19967 +               WARN_ON(current->softirq_nestcnt != 1);
19968 +               local_irq_enable();
19969 +               unlock_softirq(i);
19970 +               local_irq_disable();
19971 +       }
19972 +}
19973 +
19974 +void __local_bh_disable(void)
19975 +{
19976 +       if (++current->softirq_nestcnt == 1)
19977 +               migrate_disable();
19978 +}
19979 +EXPORT_SYMBOL(__local_bh_disable);
19980 +
19981 +void __local_bh_enable(void)
19982 +{
19983 +       if (WARN_ON(current->softirq_nestcnt == 0))
19984 +               return;
19985 +
19986 +       local_irq_disable();
19987 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
19988 +               do_current_softirqs();
19989 +       local_irq_enable();
19990 +
19991 +       if (--current->softirq_nestcnt == 0)
19992 +               migrate_enable();
19993 +}
19994 +EXPORT_SYMBOL(__local_bh_enable);
19995 +
19996 +void _local_bh_enable(void)
19997 +{
19998 +       if (WARN_ON(current->softirq_nestcnt == 0))
19999 +               return;
20000 +       if (--current->softirq_nestcnt == 0)
20001 +               migrate_enable();
20002 +}
20003 +EXPORT_SYMBOL(_local_bh_enable);
20004 +
20005 +int in_serving_softirq(void)
20006 +{
20007 +       return current->flags & PF_IN_SOFTIRQ;
20008 +}
20009 +EXPORT_SYMBOL(in_serving_softirq);
20010 +
20011 +/* Called with preemption disabled */
20012 +static void run_ksoftirqd(unsigned int cpu)
20013 +{
20014 +       local_irq_disable();
20015 +       current->softirq_nestcnt++;
20016 +
20017 +       do_current_softirqs();
20018 +       current->softirq_nestcnt--;
20019 +       local_irq_enable();
20020 +       cond_resched_rcu_qs();
20021 +}
20022 +
20023 +/*
20024 + * Called from netif_rx_ni(). Preemption enabled, but migration
20025 + * disabled, so the CPU can't go away under us.
20026 + */
20027 +void thread_do_softirq(void)
20028 +{
20029 +       if (!in_serving_softirq() && current->softirqs_raised) {
20030 +               current->softirq_nestcnt++;
20031 +               do_current_softirqs();
20032 +               current->softirq_nestcnt--;
20033 +       }
20034 +}
20035 +
20036 +static void do_raise_softirq_irqoff(unsigned int nr)
20037 +{
20038 +       unsigned int mask;
20039 +
20040 +       mask = 1UL << nr;
20041 +
20042 +       trace_softirq_raise(nr);
20043 +       or_softirq_pending(mask);
20044 +
20045 +       /*
20046 +        * If we are not in a hard interrupt and inside a bh disabled
20047 +        * region, we simply raise the flag on current. local_bh_enable()
20048 +        * will make sure that the softirq is executed. Otherwise we
20049 +        * delegate it to ksoftirqd.
20050 +        */
20051 +       if (!in_irq() && current->softirq_nestcnt)
20052 +               current->softirqs_raised |= mask;
20053 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
20054 +               return;
20055 +
20056 +       if (mask & TIMER_SOFTIRQS)
20057 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
20058 +       else
20059 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
20060 +}
20061 +
20062 +static void wakeup_proper_softirq(unsigned int nr)
20063 +{
20064 +       if ((1UL << nr) & TIMER_SOFTIRQS)
20065 +               wakeup_timer_softirqd();
20066 +       else
20067 +               wakeup_softirqd();
20068 +}
20069 +
20070 +void __raise_softirq_irqoff(unsigned int nr)
20071 +{
20072 +       do_raise_softirq_irqoff(nr);
20073 +       if (!in_irq() && !current->softirq_nestcnt)
20074 +               wakeup_proper_softirq(nr);
20075 +}
20076 +
20077 +/*
20078 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
20079 + */
20080 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
20081 +{
20082 +       unsigned int mask;
20083 +
20084 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
20085 +                        !__this_cpu_read(ktimer_softirqd)))
20086 +               return;
20087 +       mask = 1UL << nr;
20088 +
20089 +       trace_softirq_raise(nr);
20090 +       or_softirq_pending(mask);
20091 +       if (mask & TIMER_SOFTIRQS)
20092 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
20093 +       else
20094 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
20095 +       wakeup_proper_softirq(nr);
20096 +}
20097 +
20098 +/*
20099 + * This function must run with irqs disabled!
20100 + */
20101 +void raise_softirq_irqoff(unsigned int nr)
20102 +{
20103 +       do_raise_softirq_irqoff(nr);
20104 +
20105 +       /*
20106 +        * If we're in a hard interrupt we let the irq return code deal
20107 +        * with the wakeup of ksoftirqd.
20108 +        */
20109 +       if (in_irq())
20110 +               return;
20111 +       /*
20112 +        * If we are in thread context but outside of a bh disabled
20113 +        * region, we need to wake ksoftirqd as well.
20114 +        *
20115 +        * CHECKME: Some of the places which do that could be wrapped
20116 +        * into local_bh_disable/enable pairs. Though it's unclear
20117 +        * whether this is worth the effort. To find those places just
20118 +        * raise a WARN() if the condition is met.
20119 +        */
20120 +       if (!current->softirq_nestcnt)
20121 +               wakeup_proper_softirq(nr);
20122 +}
20123 +
20124 +static inline int ksoftirqd_softirq_pending(void)
20125 +{
20126 +       return current->softirqs_raised;
20127 +}
20128 +
20129 +static inline void local_bh_disable_nort(void) { }
20130 +static inline void _local_bh_enable_nort(void) { }
20131 +
20132 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
20133 +{
20134 +       /* Take over all but timer pending softirqs when starting */
20135 +       local_irq_disable();
20136 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
20137 +       local_irq_enable();
20138 +}
20139 +
20140 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
20141 +{
20142 +       struct sched_param param = { .sched_priority = 1 };
20143 +
20144 +       sched_setscheduler(current, SCHED_FIFO, &param);
20145 +
20146 +       /* Take over timer pending softirqs when starting */
20147 +       local_irq_disable();
20148 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
20149 +       local_irq_enable();
20150 +}
20151 +
20152 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
20153 +                                                   bool online)
20154 +{
20155 +       struct sched_param param = { .sched_priority = 0 };
20156 +
20157 +       sched_setscheduler(current, SCHED_NORMAL, &param);
20158 +}
20159 +
20160 +static int ktimer_softirqd_should_run(unsigned int cpu)
20161 +{
20162 +       return current->softirqs_raised;
20163 +}
20164 +
20165 +#endif /* PREEMPT_RT_FULL */
20166 +/*
20167   * Enter an interrupt context.
20168   */
20169  void irq_enter(void)
20170 @@ -341,9 +784,9 @@ void irq_enter(void)
20171                  * Prevent raise_softirq from needlessly waking up ksoftirqd
20172                  * here, as softirq will be serviced on return from interrupt.
20173                  */
20174 -               local_bh_disable();
20175 +               local_bh_disable_nort();
20176                 tick_irq_enter();
20177 -               _local_bh_enable();
20178 +               _local_bh_enable_nort();
20179         }
20180  
20181         __irq_enter();
20182 @@ -351,6 +794,7 @@ void irq_enter(void)
20183  
20184  static inline void invoke_softirq(void)
20185  {
20186 +#ifndef CONFIG_PREEMPT_RT_FULL
20187         if (ksoftirqd_running())
20188                 return;
20189  
20190 @@ -373,6 +817,18 @@ static inline void invoke_softirq(void)
20191         } else {
20192                 wakeup_softirqd();
20193         }
20194 +#else /* PREEMPT_RT_FULL */
20195 +       unsigned long flags;
20196 +
20197 +       local_irq_save(flags);
20198 +       if (__this_cpu_read(ksoftirqd) &&
20199 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
20200 +               wakeup_softirqd();
20201 +       if (__this_cpu_read(ktimer_softirqd) &&
20202 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
20203 +               wakeup_timer_softirqd();
20204 +       local_irq_restore(flags);
20205 +#endif
20206  }
20207  
20208  static inline void tick_irq_exit(void)
20209 @@ -409,26 +865,6 @@ void irq_exit(void)
20210         trace_hardirq_exit(); /* must be last! */
20211  }
20212  
20213 -/*
20214 - * This function must run with irqs disabled!
20215 - */
20216 -inline void raise_softirq_irqoff(unsigned int nr)
20217 -{
20218 -       __raise_softirq_irqoff(nr);
20219 -
20220 -       /*
20221 -        * If we're in an interrupt or softirq, we're done
20222 -        * (this also catches softirq-disabled code). We will
20223 -        * actually run the softirq once we return from
20224 -        * the irq or softirq.
20225 -        *
20226 -        * Otherwise we wake up ksoftirqd to make sure we
20227 -        * schedule the softirq soon.
20228 -        */
20229 -       if (!in_interrupt())
20230 -               wakeup_softirqd();
20231 -}
20232 -
20233  void raise_softirq(unsigned int nr)
20234  {
20235         unsigned long flags;
20236 @@ -438,12 +874,6 @@ void raise_softirq(unsigned int nr)
20237         local_irq_restore(flags);
20238  }
20239  
20240 -void __raise_softirq_irqoff(unsigned int nr)
20241 -{
20242 -       trace_softirq_raise(nr);
20243 -       or_softirq_pending(1UL << nr);
20244 -}
20245 -
20246  void open_softirq(int nr, void (*action)(struct softirq_action *))
20247  {
20248         softirq_vec[nr].action = action;
20249 @@ -460,15 +890,45 @@ struct tasklet_head {
20250  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
20251  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
20252  
20253 +static inline void
20254 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
20255 +{
20256 +       if (tasklet_trylock(t)) {
20257 +again:
20258 +               /* We may have been preempted before tasklet_trylock
20259 +                * and __tasklet_action may have already run.
20260 +                * So double-check the sched bit while the tasklet
20261 +                * is locked before adding it to the list.
20262 +                */
20263 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
20264 +                       t->next = NULL;
20265 +                       *head->tail = t;
20266 +                       head->tail = &(t->next);
20267 +                       raise_softirq_irqoff(nr);
20268 +                       tasklet_unlock(t);
20269 +               } else {
20270 +                       /* This is subtle. If we hit the corner case above,
20271 +                        * it is possible that we get preempted right here,
20272 +                        * and another task has successfully called
20273 +                        * tasklet_schedule(), then this function, and
20274 +                        * failed on the trylock. Thus we must be sure,
20275 +                        * before releasing the tasklet lock, that the
20276 +                        * SCHED bit is clear. Otherwise the tasklet
20277 +                        * may get its SCHED bit set, but not be added to
20278 +                        * the list.
20279 +                        */
20280 +                       if (!tasklet_tryunlock(t))
20281 +                               goto again;
20282 +               }
20283 +       }
20284 +}
20285 +
20286  void __tasklet_schedule(struct tasklet_struct *t)
20287  {
20288         unsigned long flags;
20289  
20290         local_irq_save(flags);
20291 -       t->next = NULL;
20292 -       *__this_cpu_read(tasklet_vec.tail) = t;
20293 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
20294 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
20295 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
20296         local_irq_restore(flags);
20297  }
20298  EXPORT_SYMBOL(__tasklet_schedule);
20299 @@ -478,10 +938,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
20300         unsigned long flags;
20301  
20302         local_irq_save(flags);
20303 -       t->next = NULL;
20304 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
20305 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
20306 -       raise_softirq_irqoff(HI_SOFTIRQ);
20307 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
20308         local_irq_restore(flags);
20309  }
20310  EXPORT_SYMBOL(__tasklet_hi_schedule);
20311 @@ -490,82 +947,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
20312  {
20313         BUG_ON(!irqs_disabled());
20314  
20315 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
20316 -       __this_cpu_write(tasklet_hi_vec.head, t);
20317 -       __raise_softirq_irqoff(HI_SOFTIRQ);
20318 +       __tasklet_hi_schedule(t);
20319  }
20320  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
20321  
20322 -static __latent_entropy void tasklet_action(struct softirq_action *a)
20323 +void  tasklet_enable(struct tasklet_struct *t)
20324  {
20325 -       struct tasklet_struct *list;
20326 +       if (!atomic_dec_and_test(&t->count))
20327 +               return;
20328 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
20329 +               tasklet_schedule(t);
20330 +}
20331 +EXPORT_SYMBOL(tasklet_enable);
20332  
20333 -       local_irq_disable();
20334 -       list = __this_cpu_read(tasklet_vec.head);
20335 -       __this_cpu_write(tasklet_vec.head, NULL);
20336 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
20337 -       local_irq_enable();
20338 +static void __tasklet_action(struct softirq_action *a,
20339 +                            struct tasklet_struct *list)
20340 +{
20341 +       int loops = 1000000;
20342  
20343         while (list) {
20344                 struct tasklet_struct *t = list;
20345  
20346                 list = list->next;
20347  
20348 -               if (tasklet_trylock(t)) {
20349 -                       if (!atomic_read(&t->count)) {
20350 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
20351 -                                                       &t->state))
20352 -                                       BUG();
20353 -                               t->func(t->data);
20354 -                               tasklet_unlock(t);
20355 -                               continue;
20356 -                       }
20357 -                       tasklet_unlock(t);
20358 +               /*
20359 +                * Should always succeed - after a tasklet got on the
20360 +                * list (after getting the SCHED bit set from 0 to 1),
20361 +                * nothing but the tasklet softirq it got queued to can
20362 +                * lock it:
20363 +                */
20364 +               if (!tasklet_trylock(t)) {
20365 +                       WARN_ON(1);
20366 +                       continue;
20367                 }
20368  
20369 -               local_irq_disable();
20370                 t->next = NULL;
20371 -               *__this_cpu_read(tasklet_vec.tail) = t;
20372 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
20373 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
20374 -               local_irq_enable();
20375 +
20376 +               /*
20377 +                * If we cannot handle the tasklet because it's disabled,
20378 +                * mark it as pending. tasklet_enable() will later
20379 +                * re-schedule the tasklet.
20380 +                */
20381 +               if (unlikely(atomic_read(&t->count))) {
20382 +out_disabled:
20383 +                       /* implicit unlock: */
20384 +                       wmb();
20385 +                       t->state = TASKLET_STATEF_PENDING;
20386 +                       continue;
20387 +               }
20388 +
20389 +               /*
20390 +                * After this point the tasklet might be rescheduled
20391 +                * on another CPU, but it can only be added to another
20392 +                * CPU's tasklet list if we unlock the tasklet (which we
20393 +                * don't do yet).
20394 +                */
20395 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
20396 +                       WARN_ON(1);
20397 +
20398 +again:
20399 +               t->func(t->data);
20400 +
20401 +               /*
20402 +                * Try to unlock the tasklet. We must use cmpxchg, because
20403 +                * another CPU might have scheduled or disabled the tasklet.
20404 +                * We only allow the STATE_RUN -> 0 transition here.
20405 +                */
20406 +               while (!tasklet_tryunlock(t)) {
20407 +                       /*
20408 +                        * If it got disabled meanwhile, bail out:
20409 +                        */
20410 +                       if (atomic_read(&t->count))
20411 +                               goto out_disabled;
20412 +                       /*
20413 +                        * If it got scheduled meanwhile, re-execute
20414 +                        * the tasklet function:
20415 +                        */
20416 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
20417 +                               goto again;
20418 +                       if (!--loops) {
20419 +                               printk("hm, tasklet state: %08lx\n", t->state);
20420 +                               WARN_ON(1);
20421 +                               tasklet_unlock(t);
20422 +                               break;
20423 +                       }
20424 +               }
20425         }
20426  }
20427  
20428 +static void tasklet_action(struct softirq_action *a)
20429 +{
20430 +       struct tasklet_struct *list;
20431 +
20432 +       local_irq_disable();
20433 +
20434 +       list = __this_cpu_read(tasklet_vec.head);
20435 +       __this_cpu_write(tasklet_vec.head, NULL);
20436 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
20437 +
20438 +       local_irq_enable();
20439 +
20440 +       __tasklet_action(a, list);
20441 +}
20442 +
20443  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
20444  {
20445         struct tasklet_struct *list;
20446  
20447         local_irq_disable();
20448 +
20449         list = __this_cpu_read(tasklet_hi_vec.head);
20450         __this_cpu_write(tasklet_hi_vec.head, NULL);
20451         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
20452 -       local_irq_enable();
20453 -
20454 -       while (list) {
20455 -               struct tasklet_struct *t = list;
20456  
20457 -               list = list->next;
20458 -
20459 -               if (tasklet_trylock(t)) {
20460 -                       if (!atomic_read(&t->count)) {
20461 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
20462 -                                                       &t->state))
20463 -                                       BUG();
20464 -                               t->func(t->data);
20465 -                               tasklet_unlock(t);
20466 -                               continue;
20467 -                       }
20468 -                       tasklet_unlock(t);
20469 -               }
20470 +       local_irq_enable();
20471  
20472 -               local_irq_disable();
20473 -               t->next = NULL;
20474 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
20475 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
20476 -               __raise_softirq_irqoff(HI_SOFTIRQ);
20477 -               local_irq_enable();
20478 -       }
20479 +       __tasklet_action(a, list);
20480  }
20481  
20482  void tasklet_init(struct tasklet_struct *t,
20483 @@ -586,7 +1083,7 @@ void tasklet_kill(struct tasklet_struct *t)
20484  
20485         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
20486                 do {
20487 -                       yield();
20488 +                       msleep(1);
20489                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
20490         }
20491         tasklet_unlock_wait(t);
20492 @@ -660,25 +1157,26 @@ void __init softirq_init(void)
20493         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
20494  }
20495  
20496 -static int ksoftirqd_should_run(unsigned int cpu)
20497 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
20498 +void tasklet_unlock_wait(struct tasklet_struct *t)
20499  {
20500 -       return local_softirq_pending();
20501 -}
20502 -
20503 -static void run_ksoftirqd(unsigned int cpu)
20504 -{
20505 -       local_irq_disable();
20506 -       if (local_softirq_pending()) {
20507 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
20508                 /*
20509 -                * We can safely run softirq on inline stack, as we are not deep
20510 -                * in the task stack here.
20511 +                * Hack for now to avoid this busy-loop:
20512                  */
20513 -               __do_softirq();
20514 -               local_irq_enable();
20515 -               cond_resched_rcu_qs();
20516 -               return;
20517 +#ifdef CONFIG_PREEMPT_RT_FULL
20518 +               msleep(1);
20519 +#else
20520 +               barrier();
20521 +#endif
20522         }
20523 -       local_irq_enable();
20524 +}
20525 +EXPORT_SYMBOL(tasklet_unlock_wait);
20526 +#endif
20527 +
20528 +static int ksoftirqd_should_run(unsigned int cpu)
20529 +{
20530 +       return ksoftirqd_softirq_pending();
20531  }
20532  
20533  #ifdef CONFIG_HOTPLUG_CPU
20534 @@ -745,17 +1243,31 @@ static int takeover_tasklets(unsigned int cpu)
20535  
20536  static struct smp_hotplug_thread softirq_threads = {
20537         .store                  = &ksoftirqd,
20538 +       .setup                  = ksoftirqd_set_sched_params,
20539         .thread_should_run      = ksoftirqd_should_run,
20540         .thread_fn              = run_ksoftirqd,
20541         .thread_comm            = "ksoftirqd/%u",
20542  };
20543  
20544 +#ifdef CONFIG_PREEMPT_RT_FULL
20545 +static struct smp_hotplug_thread softirq_timer_threads = {
20546 +       .store                  = &ktimer_softirqd,
20547 +       .setup                  = ktimer_softirqd_set_sched_params,
20548 +       .cleanup                = ktimer_softirqd_clr_sched_params,
20549 +       .thread_should_run      = ktimer_softirqd_should_run,
20550 +       .thread_fn              = run_ksoftirqd,
20551 +       .thread_comm            = "ktimersoftd/%u",
20552 +};
20553 +#endif
20554 +
20555  static __init int spawn_ksoftirqd(void)
20556  {
20557         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
20558                                   takeover_tasklets);
20559         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
20560 -
20561 +#ifdef CONFIG_PREEMPT_RT_FULL
20562 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
20563 +#endif
20564         return 0;
20565  }
20566  early_initcall(spawn_ksoftirqd);
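The tasklet rework in the softirq.c hunks above leans on tasklet_trylock()/tasklet_tryunlock() primitives defined in include/linux/interrupt.h, which this section does not show. As a sanity check of the reasoning in those comments, here is a small standalone C model (userspace, C11 atomics, made-up names) of the semantics the call sites assume: trylock wins only if the RUN bit was clear, and tryunlock succeeds only for the pure RUN -> 0 transition, so a concurrent re-schedule or disable is never lost.

/*
 * Standalone model of the tasklet state machine assumed above; the real
 * kernel definitions live in include/linux/interrupt.h and may differ.
 */
#include <stdatomic.h>
#include <stdio.h>

#define SCHED_BIT (1u << 0)	/* models TASKLET_STATE_SCHED */
#define RUN_BIT   (1u << 1)	/* models TASKLET_STATE_RUN   */

struct model_tasklet {
	_Atomic unsigned int state;
};

static int model_trylock(struct model_tasklet *t)
{
	/* test-and-set of the RUN bit: succeeds only if it was clear before */
	return !(atomic_fetch_or(&t->state, RUN_BIT) & RUN_BIT);
}

static int model_tryunlock(struct model_tasklet *t)
{
	/* RUN -> 0 only: fails if SCHED (or anything else) got set meanwhile */
	unsigned int expected = RUN_BIT;

	return atomic_compare_exchange_strong(&t->state, &expected, 0);
}

int main(void)
{
	struct model_tasklet t = { .state = 0 };

	printf("trylock:   %d\n", model_trylock(&t));	/* 1: got the RUN bit */
	atomic_fetch_or(&t.state, SCHED_BIT);		/* concurrent schedule */
	printf("tryunlock: %d\n", model_tryunlock(&t));	/* 0: must re-run the tasklet */
	return 0;
}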
20567 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
20568 index ec9ab2f01489..8b89dbedeaff 100644
20569 --- a/kernel/stop_machine.c
20570 +++ b/kernel/stop_machine.c
20571 @@ -36,7 +36,7 @@ struct cpu_stop_done {
20572  struct cpu_stopper {
20573         struct task_struct      *thread;
20574  
20575 -       spinlock_t              lock;
20576 +       raw_spinlock_t          lock;
20577         bool                    enabled;        /* is this stopper enabled? */
20578         struct list_head        works;          /* list of pending works */
20579  
20580 @@ -78,14 +78,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
20581         unsigned long flags;
20582         bool enabled;
20583  
20584 -       spin_lock_irqsave(&stopper->lock, flags);
20585 +       raw_spin_lock_irqsave(&stopper->lock, flags);
20586         enabled = stopper->enabled;
20587         if (enabled)
20588                 __cpu_stop_queue_work(stopper, work);
20589         else if (work->done)
20590                 cpu_stop_signal_done(work->done);
20591 -       spin_unlock_irqrestore(&stopper->lock, flags);
20592  
20593 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
20594         return enabled;
20595  }
20596  
20597 @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
20598         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
20599         int err;
20600  retry:
20601 -       spin_lock_irq(&stopper1->lock);
20602 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
20603 +       raw_spin_lock_irq(&stopper1->lock);
20604 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
20605  
20606         err = -ENOENT;
20607         if (!stopper1->enabled || !stopper2->enabled)
20608 @@ -255,8 +255,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
20609         __cpu_stop_queue_work(stopper1, work1);
20610         __cpu_stop_queue_work(stopper2, work2);
20611  unlock:
20612 -       spin_unlock(&stopper2->lock);
20613 -       spin_unlock_irq(&stopper1->lock);
20614 +       raw_spin_unlock(&stopper2->lock);
20615 +       raw_spin_unlock_irq(&stopper1->lock);
20616  
20617         if (unlikely(err == -EDEADLK)) {
20618                 while (stop_cpus_in_progress)
20619 @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
20620         unsigned long flags;
20621         int run;
20622  
20623 -       spin_lock_irqsave(&stopper->lock, flags);
20624 +       raw_spin_lock_irqsave(&stopper->lock, flags);
20625         run = !list_empty(&stopper->works);
20626 -       spin_unlock_irqrestore(&stopper->lock, flags);
20627 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
20628         return run;
20629  }
20630  
20631 @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
20632  
20633  repeat:
20634         work = NULL;
20635 -       spin_lock_irq(&stopper->lock);
20636 +       raw_spin_lock_irq(&stopper->lock);
20637         if (!list_empty(&stopper->works)) {
20638                 work = list_first_entry(&stopper->works,
20639                                         struct cpu_stop_work, list);
20640                 list_del_init(&work->list);
20641         }
20642 -       spin_unlock_irq(&stopper->lock);
20643 +       raw_spin_unlock_irq(&stopper->lock);
20644  
20645         if (work) {
20646                 cpu_stop_fn_t fn = work->fn;
20647 @@ -475,6 +475,8 @@ static void cpu_stopper_thread(unsigned int cpu)
20648                 struct cpu_stop_done *done = work->done;
20649                 int ret;
20650  
20651 +               /* XXX */
20652 +
20653                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
20654                 preempt_count_inc();
20655                 ret = fn(arg);
20656 @@ -541,7 +543,7 @@ static int __init cpu_stop_init(void)
20657         for_each_possible_cpu(cpu) {
20658                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
20659  
20660 -               spin_lock_init(&stopper->lock);
20661 +               raw_spin_lock_init(&stopper->lock);
20662                 INIT_LIST_HEAD(&stopper->works);
20663         }
20664  
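The stop_machine.c hunks above switch cpu_stopper::lock from spinlock_t to raw_spinlock_t: on PREEMPT_RT a spinlock_t becomes a sleeping rtmutex, while the stopper work list is touched from contexts that must stay truly atomic. A minimal sketch of that substitution pattern follows; the lock and function names are invented for illustration and are not part of this patch.

#include <linux/spinlock.h>

/* Hypothetical example lock, not part of this patch. */
static DEFINE_RAW_SPINLOCK(example_stopper_lock);

static void example_queue_work_atomic(void)
{
	unsigned long flags;

	/* raw_spin_lock_* stays a real spinning lock even on PREEMPT_RT */
	raw_spin_lock_irqsave(&example_stopper_lock, flags);
	/* ... enqueue work; must not sleep here ... */
	raw_spin_unlock_irqrestore(&example_stopper_lock, flags);
}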
20665 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
20666 index eeb7f2f5698d..369203af6406 100644
20667 --- a/kernel/time/hrtimer.c
20668 +++ b/kernel/time/hrtimer.c
20669 @@ -53,6 +53,7 @@
20670  #include <asm/uaccess.h>
20671  
20672  #include <trace/events/timer.h>
20673 +#include <trace/events/hist.h>
20674  
20675  #include "tick-internal.h"
20676  
20677 @@ -693,6 +694,29 @@ static void hrtimer_switch_to_hres(void)
20678         retrigger_next_event(NULL);
20679  }
20680  
20681 +#ifdef CONFIG_PREEMPT_RT_FULL
20682 +
20683 +static struct swork_event clock_set_delay_work;
20684 +
20685 +static void run_clock_set_delay(struct swork_event *event)
20686 +{
20687 +       clock_was_set();
20688 +}
20689 +
20690 +void clock_was_set_delayed(void)
20691 +{
20692 +       swork_queue(&clock_set_delay_work);
20693 +}
20694 +
20695 +static __init int create_clock_set_delay_thread(void)
20696 +{
20697 +       WARN_ON(swork_get());
20698 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
20699 +       return 0;
20700 +}
20701 +early_initcall(create_clock_set_delay_thread);
20702 +#else /* PREEMPT_RT_FULL */
20703 +
20704  static void clock_was_set_work(struct work_struct *work)
20705  {
20706         clock_was_set();
20707 @@ -708,6 +732,7 @@ void clock_was_set_delayed(void)
20708  {
20709         schedule_work(&hrtimer_work);
20710  }
20711 +#endif
20712  
20713  #else
20714  
20715 @@ -717,11 +742,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
20716  static inline void hrtimer_switch_to_hres(void) { }
20717  static inline void
20718  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
20719 -static inline int hrtimer_reprogram(struct hrtimer *timer,
20720 -                                   struct hrtimer_clock_base *base)
20721 -{
20722 -       return 0;
20723 -}
20724 +static inline void hrtimer_reprogram(struct hrtimer *timer,
20725 +                                    struct hrtimer_clock_base *base) { }
20726  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
20727  static inline void retrigger_next_event(void *arg) { }
20728  
20729 @@ -853,6 +875,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
20730  }
20731  EXPORT_SYMBOL_GPL(hrtimer_forward);
20732  
20733 +#ifdef CONFIG_PREEMPT_RT_BASE
20734 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
20735 +
20736 +/**
20737 + * hrtimer_wait_for_timer - Wait for a running timer
20738 + *
20739 + * @timer:     timer to wait for
20740 + *
20741 + * The function waits on the timer base's waitqueue in case the
20742 + * timer's callback function is currently executing. The
20743 + * waitqueue is woken up after the timer callback function has
20744 + * finished execution.
20745 + */
20746 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
20747 +{
20748 +       struct hrtimer_clock_base *base = timer->base;
20749 +
20750 +       if (base && base->cpu_base && !timer->irqsafe)
20751 +               wait_event(base->cpu_base->wait,
20752 +                               !(hrtimer_callback_running(timer)));
20753 +}
20754 +
20755 +#else
20756 +# define wake_up_timer_waiters(b)      do { } while (0)
20757 +#endif
20758 +
20759  /*
20760   * enqueue_hrtimer - internal function to (re)start a timer
20761   *
20762 @@ -894,6 +942,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
20763         if (!(state & HRTIMER_STATE_ENQUEUED))
20764                 return;
20765  
20766 +       if (unlikely(!list_empty(&timer->cb_entry))) {
20767 +               list_del_init(&timer->cb_entry);
20768 +               return;
20769 +       }
20770 +
20771         if (!timerqueue_del(&base->active, &timer->node))
20772                 cpu_base->active_bases &= ~(1 << base->index);
20773  
20774 @@ -989,7 +1042,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
20775         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
20776  
20777         timer_stats_hrtimer_set_start_info(timer);
20778 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20779 +       {
20780 +               ktime_t now = new_base->get_time();
20781  
20782 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
20783 +                       timer->praecox = now;
20784 +               else
20785 +                       timer->praecox = ktime_set(0, 0);
20786 +       }
20787 +#endif
20788         leftmost = enqueue_hrtimer(timer, new_base);
20789         if (!leftmost)
20790                 goto unlock;
20791 @@ -1061,7 +1123,7 @@ int hrtimer_cancel(struct hrtimer *timer)
20792  
20793                 if (ret >= 0)
20794                         return ret;
20795 -               cpu_relax();
20796 +               hrtimer_wait_for_timer(timer);
20797         }
20798  }
20799  EXPORT_SYMBOL_GPL(hrtimer_cancel);
20800 @@ -1137,6 +1199,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
20801  
20802         base = hrtimer_clockid_to_base(clock_id);
20803         timer->base = &cpu_base->clock_base[base];
20804 +       INIT_LIST_HEAD(&timer->cb_entry);
20805         timerqueue_init(&timer->node);
20806  
20807  #ifdef CONFIG_TIMER_STATS
20808 @@ -1177,6 +1240,7 @@ bool hrtimer_active(const struct hrtimer *timer)
20809                 seq = raw_read_seqcount_begin(&cpu_base->seq);
20810  
20811                 if (timer->state != HRTIMER_STATE_INACTIVE ||
20812 +                   cpu_base->running_soft == timer ||
20813                     cpu_base->running == timer)
20814                         return true;
20815  
20816 @@ -1275,10 +1339,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
20817         cpu_base->running = NULL;
20818  }
20819  
20820 -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20821 +#ifdef CONFIG_PREEMPT_RT_BASE
20822 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
20823 +                                struct hrtimer_clock_base *base)
20824 +{
20825 +       int leftmost;
20826 +
20827 +       if (restart != HRTIMER_NORESTART &&
20828 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
20829 +
20830 +               leftmost = enqueue_hrtimer(timer, base);
20831 +               if (!leftmost)
20832 +                       return;
20833 +#ifdef CONFIG_HIGH_RES_TIMERS
20834 +               if (!hrtimer_is_hres_active(timer)) {
20835 +                       /*
20836 +                        * Kick to reschedule the next tick to handle the new timer
20837 +                        * on the dynticks target.
20838 +                        */
20839 +                       if (base->cpu_base->nohz_active)
20840 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
20841 +               } else {
20842 +
20843 +                       hrtimer_reprogram(timer, base);
20844 +               }
20845 +#endif
20846 +       }
20847 +}
20848 +
20849 +/*
20850 + * The changes in mainline which removed the callback modes from
20851 + * hrtimer are not yet working with -rt. The non-wakeup_process()
20852 + * based callbacks, which involve sleeping locks, need to be treated
20853 + * separately.
20854 + */
20855 +static void hrtimer_rt_run_pending(void)
20856 +{
20857 +       enum hrtimer_restart (*fn)(struct hrtimer *);
20858 +       struct hrtimer_cpu_base *cpu_base;
20859 +       struct hrtimer_clock_base *base;
20860 +       struct hrtimer *timer;
20861 +       int index, restart;
20862 +
20863 +       local_irq_disable();
20864 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
20865 +
20866 +       raw_spin_lock(&cpu_base->lock);
20867 +
20868 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
20869 +               base = &cpu_base->clock_base[index];
20870 +
20871 +               while (!list_empty(&base->expired)) {
20872 +                       timer = list_first_entry(&base->expired,
20873 +                                                struct hrtimer, cb_entry);
20874 +
20875 +                       /*
20876 +                        * Same as the above __run_hrtimer function,
20877 +                        * except that we run with interrupts enabled.
20878 +                        */
20879 +                       debug_deactivate(timer);
20880 +                       cpu_base->running_soft = timer;
20881 +                       raw_write_seqcount_barrier(&cpu_base->seq);
20882 +
20883 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
20884 +                       timer_stats_account_hrtimer(timer);
20885 +                       fn = timer->function;
20886 +
20887 +                       raw_spin_unlock_irq(&cpu_base->lock);
20888 +                       restart = fn(timer);
20889 +                       raw_spin_lock_irq(&cpu_base->lock);
20890 +
20891 +                       hrtimer_rt_reprogram(restart, timer, base);
20892 +                       raw_write_seqcount_barrier(&cpu_base->seq);
20893 +
20894 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
20895 +                       cpu_base->running_soft = NULL;
20896 +               }
20897 +       }
20898 +
20899 +       raw_spin_unlock_irq(&cpu_base->lock);
20900 +
20901 +       wake_up_timer_waiters(cpu_base);
20902 +}
20903 +
20904 +static int hrtimer_rt_defer(struct hrtimer *timer)
20905 +{
20906 +       if (timer->irqsafe)
20907 +               return 0;
20908 +
20909 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
20910 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
20911 +       return 1;
20912 +}
20913 +
20914 +#else
20915 +
20916 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
20917 +
20918 +#endif
20919 +
20920 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
20921 +
20922 +static int __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20923  {
20924         struct hrtimer_clock_base *base = cpu_base->clock_base;
20925         unsigned int active = cpu_base->active_bases;
20926 +       int raise = 0;
20927  
20928         for (; active; base++, active >>= 1) {
20929                 struct timerqueue_node *node;
20930 @@ -1294,6 +1460,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20931  
20932                         timer = container_of(node, struct hrtimer, node);
20933  
20934 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
20935 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
20936 +                               timer->praecox : hrtimer_get_expires(timer),
20937 +                               basenow)),
20938 +                           current,
20939 +                           timer->function == hrtimer_wakeup ?
20940 +                           container_of(timer, struct hrtimer_sleeper,
20941 +                               timer)->task : NULL);
20942 +
20943                         /*
20944                          * The immediate goal for using the softexpires is
20945                          * minimizing wakeups, not running timers at the
20946 @@ -1309,9 +1484,13 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20947                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
20948                                 break;
20949  
20950 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
20951 +                       if (!hrtimer_rt_defer(timer))
20952 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
20953 +                       else
20954 +                               raise = 1;
20955                 }
20956         }
20957 +       return raise;
20958  }
20959  
20960  #ifdef CONFIG_HIGH_RES_TIMERS
20961 @@ -1325,6 +1504,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
20962         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
20963         ktime_t expires_next, now, entry_time, delta;
20964         int retries = 0;
20965 +       int raise;
20966  
20967         BUG_ON(!cpu_base->hres_active);
20968         cpu_base->nr_events++;
20969 @@ -1343,7 +1523,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
20970          */
20971         cpu_base->expires_next.tv64 = KTIME_MAX;
20972  
20973 -       __hrtimer_run_queues(cpu_base, now);
20974 +       raise = __hrtimer_run_queues(cpu_base, now);
20975  
20976         /* Reevaluate the clock bases for the next expiry */
20977         expires_next = __hrtimer_get_next_event(cpu_base);
20978 @@ -1354,6 +1534,8 @@ void hrtimer_interrupt(struct clock_event_device *dev)
20979         cpu_base->expires_next = expires_next;
20980         cpu_base->in_hrtirq = 0;
20981         raw_spin_unlock(&cpu_base->lock);
20982 +       if (raise)
20983 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
20984  
20985         /* Reprogramming necessary ? */
20986         if (!tick_program_event(expires_next, 0)) {
20987 @@ -1433,6 +1615,7 @@ void hrtimer_run_queues(void)
20988  {
20989         struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
20990         ktime_t now;
20991 +       int raise;
20992  
20993         if (__hrtimer_hres_active(cpu_base))
20994                 return;
20995 @@ -1451,8 +1634,10 @@ void hrtimer_run_queues(void)
20996  
20997         raw_spin_lock(&cpu_base->lock);
20998         now = hrtimer_update_base(cpu_base);
20999 -       __hrtimer_run_queues(cpu_base, now);
21000 +       raise = __hrtimer_run_queues(cpu_base, now);
21001         raw_spin_unlock(&cpu_base->lock);
21002 +       if (raise)
21003 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
21004  }
21005  
21006  /*
21007 @@ -1474,16 +1659,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
21008  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
21009  {
21010         sl->timer.function = hrtimer_wakeup;
21011 +       sl->timer.irqsafe = 1;
21012         sl->task = task;
21013  }
21014  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
21015  
21016 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
21017 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
21018 +                               unsigned long state)
21019  {
21020         hrtimer_init_sleeper(t, current);
21021  
21022         do {
21023 -               set_current_state(TASK_INTERRUPTIBLE);
21024 +               set_current_state(state);
21025                 hrtimer_start_expires(&t->timer, mode);
21026  
21027                 if (likely(t->task))
21028 @@ -1525,7 +1712,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
21029                                 HRTIMER_MODE_ABS);
21030         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
21031  
21032 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
21033 +       /* cpu_chill() does not care about restart state. */
21034 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
21035                 goto out;
21036  
21037         rmtp = restart->nanosleep.rmtp;
21038 @@ -1542,8 +1730,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
21039         return ret;
21040  }
21041  
21042 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21043 -                      const enum hrtimer_mode mode, const clockid_t clockid)
21044 +static long
21045 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21046 +                   const enum hrtimer_mode mode, const clockid_t clockid,
21047 +                   unsigned long state)
21048  {
21049         struct restart_block *restart;
21050         struct hrtimer_sleeper t;
21051 @@ -1556,7 +1746,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21052  
21053         hrtimer_init_on_stack(&t.timer, clockid, mode);
21054         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
21055 -       if (do_nanosleep(&t, mode))
21056 +       if (do_nanosleep(&t, mode, state))
21057                 goto out;
21058  
21059         /* Absolute timers do not update the rmtp value and restart: */
21060 @@ -1583,6 +1773,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21061         return ret;
21062  }
21063  
21064 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
21065 +                      const enum hrtimer_mode mode, const clockid_t clockid)
21066 +{
21067 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
21068 +}
21069 +
21070  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
21071                 struct timespec __user *, rmtp)
21072  {
21073 @@ -1597,6 +1793,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
21074         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
21075  }
21076  
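
As a point of reference for the hrtimer_nanosleep() paths above: user space reaches the absolute-expiry variant (HRTIMER_MODE_ABS) through clock_nanosleep() with TIMER_ABSTIME. Below is a minimal, self-contained sketch of a drift-free periodic loop built on it; the 1 ms period and the iteration count are arbitrary choices for the example, not anything taken from the patch.

/* build: cc -O2 periodic.c   (older glibc may need -lrt) */
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L
#define PERIOD_NS    1000000L           /* 1 ms, arbitrary for the demo */

int main(void)
{
        struct timespec next;
        int i;

        clock_gettime(CLOCK_MONOTONIC, &next);
        for (i = 0; i < 1000; i++) {
                next.tv_nsec += PERIOD_NS;
                if (next.tv_nsec >= NSEC_PER_SEC) {
                        next.tv_nsec -= NSEC_PER_SEC;
                        next.tv_sec++;
                }
                /* TIMER_ABSTIME: wake at an absolute deadline, so wakeup
                 * latency does not accumulate from period to period. */
                clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
        }
        printf("ran 1000 periods of %ld ns\n", PERIOD_NS);
        return 0;
}
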
21077 +#ifdef CONFIG_PREEMPT_RT_FULL
21078 +/*
21079 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
21080 + */
21081 +void cpu_chill(void)
21082 +{
21083 +       struct timespec tu = {
21084 +               .tv_nsec = NSEC_PER_MSEC,
21085 +       };
21086 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
21087 +
21088 +       current->flags |= PF_NOFREEZE;
21089 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
21090 +                           TASK_UNINTERRUPTIBLE);
21091 +       if (!freeze_flag)
21092 +               current->flags &= ~PF_NOFREEZE;
21093 +}
21094 +EXPORT_SYMBOL(cpu_chill);
21095 +#endif
21096 +
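
cpu_chill() exists so that retry loops which used to spin on cpu_relax() still make forward progress on PREEMPT_RT_FULL, where the holder of the contested resource may be unable to preempt the spinner. A rough user-space analogue of the same 1 ms back-off idea is sketched below; the atomic flag and the try_grab_resource() helper are inventions for this sketch, not part of the patch.

/* build: cc -std=c11 chill_demo.c */
#define _POSIX_C_SOURCE 200809L
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static atomic_flag busy = ATOMIC_FLAG_INIT;     /* stand-in for the contended resource */

static bool try_grab_resource(void)             /* hypothetical helper, sketch only */
{
        return !atomic_flag_test_and_set(&busy);
}

int main(void)
{
        const struct timespec one_ms = { .tv_sec = 0, .tv_nsec = 1000000L };

        while (!try_grab_resource()) {
                /* Sleep 1 ms instead of spinning, mirroring cpu_chill():
                 * give the current holder a chance to run and release. */
                nanosleep(&one_ms, NULL);
        }
        printf("resource acquired\n");
        return 0;
}
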
21097  /*
21098   * Functions related to boot-time initialization:
21099   */
21100 @@ -1608,16 +1824,20 @@ int hrtimers_prepare_cpu(unsigned int cpu)
21101         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
21102                 cpu_base->clock_base[i].cpu_base = cpu_base;
21103                 timerqueue_init_head(&cpu_base->clock_base[i].active);
21104 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
21105         }
21106  
21107         cpu_base->cpu = cpu;
21108         hrtimer_init_hres(cpu_base);
21109 +#ifdef CONFIG_PREEMPT_RT_BASE
21110 +       init_waitqueue_head(&cpu_base->wait);
21111 +#endif
21112         return 0;
21113  }
21114  
21115  #ifdef CONFIG_HOTPLUG_CPU
21116  
21117 -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
21118 +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
21119                                 struct hrtimer_clock_base *new_base)
21120  {
21121         struct hrtimer *timer;
21122 @@ -1645,12 +1865,21 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
21123                  */
21124                 enqueue_hrtimer(timer, new_base);
21125         }
21126 +#ifdef CONFIG_PREEMPT_RT_BASE
21127 +       list_splice_tail(&old_base->expired, &new_base->expired);
21128 +       /*
21129 +        * Tell the caller to raise HRTIMER_SOFTIRQ.  We can't safely
21130 +        * acquire ktimersoftd->pi_lock while the base lock is held.
21131 +        */
21132 +       return !list_empty(&new_base->expired);
21133 +#endif
21134 +       return 0;
21135  }
21136  
21137  int hrtimers_dead_cpu(unsigned int scpu)
21138  {
21139         struct hrtimer_cpu_base *old_base, *new_base;
21140 -       int i;
21141 +       int i, raise = 0;
21142  
21143         BUG_ON(cpu_online(scpu));
21144         tick_cancel_sched_timer(scpu);
21145 @@ -1666,13 +1895,16 @@ int hrtimers_dead_cpu(unsigned int scpu)
21146         raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
21147  
21148         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
21149 -               migrate_hrtimer_list(&old_base->clock_base[i],
21150 -                                    &new_base->clock_base[i]);
21151 +               raise |= migrate_hrtimer_list(&old_base->clock_base[i],
21152 +                                             &new_base->clock_base[i]);
21153         }
21154  
21155         raw_spin_unlock(&old_base->lock);
21156         raw_spin_unlock(&new_base->lock);
21157  
21158 +       if (raise)
21159 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
21160 +
21161         /* Check, if we got expired work to do */
21162         __hrtimer_peek_ahead_timers();
21163         local_irq_enable();
21164 @@ -1681,9 +1913,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
21165  
21166  #endif /* CONFIG_HOTPLUG_CPU */
21167  
21168 +#ifdef CONFIG_PREEMPT_RT_BASE
21169 +
21170 +static void run_hrtimer_softirq(struct softirq_action *h)
21171 +{
21172 +       hrtimer_rt_run_pending();
21173 +}
21174 +
21175 +static void hrtimers_open_softirq(void)
21176 +{
21177 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
21178 +}
21179 +
21180 +#else
21181 +static void hrtimers_open_softirq(void) { }
21182 +#endif
21183 +
21184  void __init hrtimers_init(void)
21185  {
21186         hrtimers_prepare_cpu(smp_processor_id());
21187 +       hrtimers_open_softirq();
21188  }
21189  
21190  /**
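
The irqsafe flag used in this file (sl->timer.irqsafe here, and later ts->sched_timer.irqsafe and bctimer.irqsafe) is what keeps a callback running from hard interrupt context once the patch defers everything else to HRTIMER_SOFTIRQ/ktimersoftd. A hedged, module-style sketch against the patched 4.9 hrtimer API follows; the 100 us period and the demo_* names are made up, and whatever the callback does must itself be hard-IRQ safe.

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_tick(struct hrtimer *t)
{
        /* Runs in hard interrupt context even on RT because of .irqsafe. */
        hrtimer_forward_now(t, ns_to_ktime(100 * NSEC_PER_USEC));
        return HRTIMER_RESTART;
}

static int __init demo_init(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_tick;
        /* Without this, the RT patch defers the callback to ktimersoftd. */
        demo_timer.irqsafe = 1;
        hrtimer_start(&demo_timer, ns_to_ktime(100 * NSEC_PER_USEC),
                      HRTIMER_MODE_REL);
        return 0;
}

static void __exit demo_exit(void)
{
        hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
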
21191 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
21192 index 1d5c7204ddc9..184de6751180 100644
21193 --- a/kernel/time/itimer.c
21194 +++ b/kernel/time/itimer.c
21195 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
21196                 /* We are sharing ->siglock with it_real_fn() */
21197                 if (hrtimer_try_to_cancel(timer) < 0) {
21198                         spin_unlock_irq(&tsk->sighand->siglock);
21199 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
21200                         goto again;
21201                 }
21202                 expires = timeval_to_ktime(value->it_value);
21203 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
21204 index 555e21f7b966..a5d6435fabbb 100644
21205 --- a/kernel/time/jiffies.c
21206 +++ b/kernel/time/jiffies.c
21207 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
21208         .max_cycles     = 10,
21209  };
21210  
21211 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
21212 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
21213 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
21214  
21215  #if (BITS_PER_LONG < 64)
21216  u64 get_jiffies_64(void)
21217 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
21218         u64 ret;
21219  
21220         do {
21221 -               seq = read_seqbegin(&jiffies_lock);
21222 +               seq = read_seqcount_begin(&jiffies_seq);
21223                 ret = jiffies_64;
21224 -       } while (read_seqretry(&jiffies_lock, seq));
21225 +       } while (read_seqcount_retry(&jiffies_seq, seq));
21226         return ret;
21227  }
21228  EXPORT_SYMBOL(get_jiffies_64);
21229 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
21230 index 6df8927c58a5..05b7391bf9bd 100644
21231 --- a/kernel/time/ntp.c
21232 +++ b/kernel/time/ntp.c
21233 @@ -17,6 +17,7 @@
21234  #include <linux/module.h>
21235  #include <linux/rtc.h>
21236  #include <linux/math64.h>
21237 +#include <linux/swork.h>
21238  
21239  #include "ntp_internal.h"
21240  #include "timekeeping_internal.h"
21241 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
21242                            &sync_cmos_work, timespec64_to_jiffies(&next));
21243  }
21244  
21245 +#ifdef CONFIG_PREEMPT_RT_FULL
21246 +
21247 +static void run_clock_set_delay(struct swork_event *event)
21248 +{
21249 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
21250 +}
21251 +
21252 +static struct swork_event ntp_cmos_swork;
21253 +
21254 +void ntp_notify_cmos_timer(void)
21255 +{
21256 +       swork_queue(&ntp_cmos_swork);
21257 +}
21258 +
21259 +static __init int create_cmos_delay_thread(void)
21260 +{
21261 +       WARN_ON(swork_get());
21262 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
21263 +       return 0;
21264 +}
21265 +early_initcall(create_cmos_delay_thread);
21266 +
21267 +#else
21268 +
21269  void ntp_notify_cmos_timer(void)
21270  {
21271         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
21272  }
21273 +#endif /* CONFIG_PREEMPT_RT_FULL */
21274  
21275  #else
21276  void ntp_notify_cmos_timer(void) { }
21277 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
21278 index 39008d78927a..633f4eaca9e7 100644
21279 --- a/kernel/time/posix-cpu-timers.c
21280 +++ b/kernel/time/posix-cpu-timers.c
21281 @@ -3,6 +3,7 @@
21282   */
21283  
21284  #include <linux/sched.h>
21285 +#include <linux/sched/rt.h>
21286  #include <linux/posix-timers.h>
21287  #include <linux/errno.h>
21288  #include <linux/math64.h>
21289 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
21290         /*
21291          * Disarm any old timer after extracting its expiry time.
21292          */
21293 -       WARN_ON_ONCE(!irqs_disabled());
21294 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21295  
21296         ret = 0;
21297         old_incr = timer->it.cpu.incr;
21298 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
21299         /*
21300          * Now re-arm for the new expiry time.
21301          */
21302 -       WARN_ON_ONCE(!irqs_disabled());
21303 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21304         arm_timer(timer);
21305         unlock_task_sighand(p, &flags);
21306  
21307 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
21308   * already updated our counts.  We need to check if any timers fire now.
21309   * Interrupts are disabled.
21310   */
21311 -void run_posix_cpu_timers(struct task_struct *tsk)
21312 +static void __run_posix_cpu_timers(struct task_struct *tsk)
21313  {
21314         LIST_HEAD(firing);
21315         struct k_itimer *timer, *next;
21316         unsigned long flags;
21317  
21318 -       WARN_ON_ONCE(!irqs_disabled());
21319 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21320  
21321         /*
21322          * The fast path checks that there are no expired thread or thread
21323 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
21324         }
21325  }
21326  
21327 +#ifdef CONFIG_PREEMPT_RT_BASE
21328 +#include <linux/kthread.h>
21329 +#include <linux/cpu.h>
21330 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
21331 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
21332 +
21333 +static int posix_cpu_timers_thread(void *data)
21334 +{
21335 +       int cpu = (long)data;
21336 +
21337 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
21338 +
21339 +       while (!kthread_should_stop()) {
21340 +               struct task_struct *tsk = NULL;
21341 +               struct task_struct *next = NULL;
21342 +
21343 +               if (cpu_is_offline(cpu))
21344 +                       goto wait_to_die;
21345 +
21346 +               /* grab task list */
21347 +               raw_local_irq_disable();
21348 +               tsk = per_cpu(posix_timer_tasklist, cpu);
21349 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
21350 +               raw_local_irq_enable();
21351 +
21352 +               /* it's possible the list is empty, just return */
21353 +               if (!tsk) {
21354 +                       set_current_state(TASK_INTERRUPTIBLE);
21355 +                       schedule();
21356 +                       __set_current_state(TASK_RUNNING);
21357 +                       continue;
21358 +               }
21359 +
21360 +               /* Process task list */
21361 +               while (1) {
21362 +                       /* save next */
21363 +                       next = tsk->posix_timer_list;
21364 +
21365 +                       /* run the task timers, clear its ptr and
21366 +                        * unreference it
21367 +                        */
21368 +                       __run_posix_cpu_timers(tsk);
21369 +                       tsk->posix_timer_list = NULL;
21370 +                       put_task_struct(tsk);
21371 +
21372 +                       /* check if this is the last on the list */
21373 +                       if (next == tsk)
21374 +                               break;
21375 +                       tsk = next;
21376 +               }
21377 +       }
21378 +       return 0;
21379 +
21380 +wait_to_die:
21381 +       /* Wait for kthread_stop */
21382 +       set_current_state(TASK_INTERRUPTIBLE);
21383 +       while (!kthread_should_stop()) {
21384 +               schedule();
21385 +               set_current_state(TASK_INTERRUPTIBLE);
21386 +       }
21387 +       __set_current_state(TASK_RUNNING);
21388 +       return 0;
21389 +}
21390 +
21391 +static inline int __fastpath_timer_check(struct task_struct *tsk)
21392 +{
21393 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
21394 +       if (unlikely(tsk->exit_state))
21395 +               return 0;
21396 +
21397 +       if (!task_cputime_zero(&tsk->cputime_expires))
21398 +               return 1;
21399 +
21400 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
21401 +               return 1;
21402 +
21403 +       return 0;
21404 +}
21405 +
21406 +void run_posix_cpu_timers(struct task_struct *tsk)
21407 +{
21408 +       unsigned long cpu = smp_processor_id();
21409 +       struct task_struct *tasklist;
21410 +
21411 +       BUG_ON(!irqs_disabled());
21412 +       if (!per_cpu(posix_timer_task, cpu))
21413 +               return;
21414 +       /* get per-cpu references */
21415 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
21416 +
21417 +       /* check to see if we're already queued */
21418 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
21419 +               get_task_struct(tsk);
21420 +               if (tasklist) {
21421 +                       tsk->posix_timer_list = tasklist;
21422 +               } else {
21423 +                       /*
21424 +                        * The list is terminated by a self-pointing
21425 +                        * task_struct
21426 +                        */
21427 +                       tsk->posix_timer_list = tsk;
21428 +               }
21429 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
21430 +
21431 +               wake_up_process(per_cpu(posix_timer_task, cpu));
21432 +       }
21433 +}
21434 +
21435 +/*
21436 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
21437 + * Here we can start up the per-CPU posix timer thread for the new CPU.
21438 + */
21439 +static int posix_cpu_thread_call(struct notifier_block *nfb,
21440 +                                unsigned long action, void *hcpu)
21441 +{
21442 +       int cpu = (long)hcpu;
21443 +       struct task_struct *p;
21444 +       struct sched_param param;
21445 +
21446 +       switch (action) {
21447 +       case CPU_UP_PREPARE:
21448 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
21449 +                                       "posixcputmr/%d",cpu);
21450 +               if (IS_ERR(p))
21451 +                       return NOTIFY_BAD;
21452 +               p->flags |= PF_NOFREEZE;
21453 +               kthread_bind(p, cpu);
21454 +               /* Must be high prio to avoid getting starved */
21455 +               param.sched_priority = MAX_RT_PRIO-1;
21456 +               sched_setscheduler(p, SCHED_FIFO, &param);
21457 +               per_cpu(posix_timer_task,cpu) = p;
21458 +               break;
21459 +       case CPU_ONLINE:
21460 +               /* Strictly unnecessary, as first user will wake it. */
21461 +               wake_up_process(per_cpu(posix_timer_task,cpu));
21462 +               break;
21463 +#ifdef CONFIG_HOTPLUG_CPU
21464 +       case CPU_UP_CANCELED:
21465 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
21466 +               kthread_bind(per_cpu(posix_timer_task, cpu),
21467 +                            cpumask_any(cpu_online_mask));
21468 +               kthread_stop(per_cpu(posix_timer_task,cpu));
21469 +               per_cpu(posix_timer_task,cpu) = NULL;
21470 +               break;
21471 +       case CPU_DEAD:
21472 +               kthread_stop(per_cpu(posix_timer_task,cpu));
21473 +               per_cpu(posix_timer_task,cpu) = NULL;
21474 +               break;
21475 +#endif
21476 +       }
21477 +       return NOTIFY_OK;
21478 +}
21479 +
21480 +/* Register at highest priority so that task migration (migrate_all_tasks)
21481 + * happens before everything else.
21482 + */
21483 +static struct notifier_block posix_cpu_thread_notifier = {
21484 +       .notifier_call = posix_cpu_thread_call,
21485 +       .priority = 10
21486 +};
21487 +
21488 +static int __init posix_cpu_thread_init(void)
21489 +{
21490 +       void *hcpu = (void *)(long)smp_processor_id();
21491 +       /* Start one for boot CPU. */
21492 +       unsigned long cpu;
21493 +
21494 +       /* init the per-cpu posix_timer_tasklets */
21495 +       for_each_possible_cpu(cpu)
21496 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
21497 +
21498 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
21499 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
21500 +       register_cpu_notifier(&posix_cpu_thread_notifier);
21501 +       return 0;
21502 +}
21503 +early_initcall(posix_cpu_thread_init);
21504 +#else /* CONFIG_PREEMPT_RT_BASE */
21505 +void run_posix_cpu_timers(struct task_struct *tsk)
21506 +{
21507 +       __run_posix_cpu_timers(tsk);
21508 +}
21509 +#endif /* CONFIG_PREEMPT_RT_BASE */
21510 +
21511  /*
21512   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
21513   * The tsk->sighand->siglock must be held by the caller.
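
With CONFIG_PREEMPT_RT_BASE the expiry work for CPU-time timers is handed to the per-CPU posixcputmr/N kthread instead of running directly from the tick. The path is exercised by ordinary process CPU-time timers; the self-contained user-space example below shows one (the 100 ms interval and five expirations are arbitrary; link with -lrt on older glibc).

#define _POSIX_C_SOURCE 200809L
#include <signal.h>
#include <stdio.h>
#include <time.h>

static volatile sig_atomic_t fired;

static void on_tick(int sig)
{
        (void)sig;
        fired++;
}

int main(void)
{
        struct sigaction sa = {0};
        struct sigevent sev = {0};
        struct itimerspec its = {0};
        timer_t tid;

        sa.sa_handler = on_tick;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGRTMIN, &sa, NULL);

        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGRTMIN;             /* a real-time signal */

        its.it_value.tv_nsec = 100000000L;      /* first expiry after 100 ms of CPU time */
        its.it_interval = its.it_value;         /* then every further 100 ms of CPU time */

        /* A CPU-time clock: expiries are driven by run_posix_cpu_timers(). */
        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) == -1) {
                perror("timer_create");
                return 1;
        }
        timer_settime(tid, 0, &its, NULL);

        while (fired < 5)
                ;                               /* burn CPU so the timer can expire */

        printf("got %d CPU-time expirations\n", (int)fired);
        timer_delete(tid);
        return 0;
}
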
21514 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
21515 index f2826c35e918..464a98155a0e 100644
21516 --- a/kernel/time/posix-timers.c
21517 +++ b/kernel/time/posix-timers.c
21518 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
21519  static struct pid *good_sigevent(sigevent_t * event)
21520  {
21521         struct task_struct *rtn = current->group_leader;
21522 +       int sig = event->sigev_signo;
21523  
21524         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
21525                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
21526 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
21527                 return NULL;
21528  
21529         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
21530 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
21531 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
21532 +            sig_kernel_coredump(sig)))
21533                 return NULL;
21534  
21535         return task_pid(rtn);
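
The stricter check in good_sigevent() above means that a timer whose sigevent names a kernel-only or coredump signal is now refused at creation time. A quick user-space check (sketch only; the refusal is normally reported as EINVAL):

#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
        struct sigevent sev = {0};
        timer_t tid;

        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGKILL;      /* kernel-only signal, now refused */

        if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == -1)
                printf("rejected as expected: %s\n", strerror(errno));
        else
                timer_delete(tid);      /* pre-patch kernels accept this */
        return 0;
}
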
21536 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
21537         return overrun;
21538  }
21539  
21540 +/*
21541 + * Protected by RCU!
21542 + */
21543 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
21544 +{
21545 +#ifdef CONFIG_PREEMPT_RT_FULL
21546 +       if (kc->timer_set == common_timer_set)
21547 +               hrtimer_wait_for_timer(&timr->it.real.timer);
21548 +       else
21549 +               /* FIXME: Whacky hack for posix-cpu-timers */
21550 +               schedule_timeout(1);
21551 +#endif
21552 +}
21553 +
21554  /* Set a POSIX.1b interval timer. */
21555  /* timr->it_lock is taken. */
21556  static int
21557 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
21558         if (!timr)
21559                 return -EINVAL;
21560  
21561 +       rcu_read_lock();
21562         kc = clockid_to_kclock(timr->it_clock);
21563         if (WARN_ON_ONCE(!kc || !kc->timer_set))
21564                 error = -EINVAL;
21565 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
21566  
21567         unlock_timer(timr, flag);
21568         if (error == TIMER_RETRY) {
21569 +               timer_wait_for_callback(kc, timr);
21570                 rtn = NULL;     // We already got the old time...
21571 +               rcu_read_unlock();
21572                 goto retry;
21573         }
21574 +       rcu_read_unlock();
21575  
21576         if (old_setting && !error &&
21577             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
21578 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
21579         if (!timer)
21580                 return -EINVAL;
21581  
21582 +       rcu_read_lock();
21583         if (timer_delete_hook(timer) == TIMER_RETRY) {
21584                 unlock_timer(timer, flags);
21585 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
21586 +                                       timer);
21587 +               rcu_read_unlock();
21588                 goto retry_delete;
21589         }
21590 +       rcu_read_unlock();
21591  
21592         spin_lock(&current->sighand->siglock);
21593         list_del(&timer->list);
21594 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
21595  retry_delete:
21596         spin_lock_irqsave(&timer->it_lock, flags);
21597  
21598 +       /* On RT we can race with a deletion */
21599 +       if (!timer->it_signal) {
21600 +               unlock_timer(timer, flags);
21601 +               return;
21602 +       }
21603 +
21604         if (timer_delete_hook(timer) == TIMER_RETRY) {
21605 +               rcu_read_lock();
21606                 unlock_timer(timer, flags);
21607 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
21608 +                                       timer);
21609 +               rcu_read_unlock();
21610                 goto retry_delete;
21611         }
21612         list_del(&timer->list);
21613 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
21614 index 690b797f522e..fe8ba1619879 100644
21615 --- a/kernel/time/tick-broadcast-hrtimer.c
21616 +++ b/kernel/time/tick-broadcast-hrtimer.c
21617 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
21618  {
21619         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
21620         bctimer.function = bc_handler;
21621 +       bctimer.irqsafe = true;
21622         clockevents_register_device(&ce_broadcast_hrtimer);
21623  }
21624 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
21625 index 4fcd99e12aa0..5a47f2e98faf 100644
21626 --- a/kernel/time/tick-common.c
21627 +++ b/kernel/time/tick-common.c
21628 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
21629  static void tick_periodic(int cpu)
21630  {
21631         if (tick_do_timer_cpu == cpu) {
21632 -               write_seqlock(&jiffies_lock);
21633 +               raw_spin_lock(&jiffies_lock);
21634 +               write_seqcount_begin(&jiffies_seq);
21635  
21636                 /* Keep track of the next tick event */
21637                 tick_next_period = ktime_add(tick_next_period, tick_period);
21638  
21639                 do_timer(1);
21640 -               write_sequnlock(&jiffies_lock);
21641 +               write_seqcount_end(&jiffies_seq);
21642 +               raw_spin_unlock(&jiffies_lock);
21643                 update_wall_time();
21644         }
21645  
21646 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
21647                 ktime_t next;
21648  
21649                 do {
21650 -                       seq = read_seqbegin(&jiffies_lock);
21651 +                       seq = read_seqcount_begin(&jiffies_seq);
21652                         next = tick_next_period;
21653 -               } while (read_seqretry(&jiffies_lock, seq));
21654 +               } while (read_seqcount_retry(&jiffies_seq, seq));
21655  
21656                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
21657  
21658 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
21659 index 3bcb61b52f6c..66d85482a96e 100644
21660 --- a/kernel/time/tick-sched.c
21661 +++ b/kernel/time/tick-sched.c
21662 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
21663                 return;
21664  
21665         /* Reevaluate with jiffies_lock held */
21666 -       write_seqlock(&jiffies_lock);
21667 +       raw_spin_lock(&jiffies_lock);
21668 +       write_seqcount_begin(&jiffies_seq);
21669  
21670         delta = ktime_sub(now, last_jiffies_update);
21671         if (delta.tv64 >= tick_period.tv64) {
21672 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
21673                 /* Keep the tick_next_period variable up to date */
21674                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
21675         } else {
21676 -               write_sequnlock(&jiffies_lock);
21677 +               write_seqcount_end(&jiffies_seq);
21678 +               raw_spin_unlock(&jiffies_lock);
21679                 return;
21680         }
21681 -       write_sequnlock(&jiffies_lock);
21682 +       write_seqcount_end(&jiffies_seq);
21683 +       raw_spin_unlock(&jiffies_lock);
21684         update_wall_time();
21685  }
21686  
21687 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
21688  {
21689         ktime_t period;
21690  
21691 -       write_seqlock(&jiffies_lock);
21692 +       raw_spin_lock(&jiffies_lock);
21693 +       write_seqcount_begin(&jiffies_seq);
21694         /* Did we start the jiffies update yet ? */
21695         if (last_jiffies_update.tv64 == 0)
21696                 last_jiffies_update = tick_next_period;
21697         period = last_jiffies_update;
21698 -       write_sequnlock(&jiffies_lock);
21699 +       write_seqcount_end(&jiffies_seq);
21700 +       raw_spin_unlock(&jiffies_lock);
21701         return period;
21702  }
21703  
21704 @@ -215,6 +220,7 @@ static void nohz_full_kick_func(struct irq_work *work)
21705  
21706  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
21707         .func = nohz_full_kick_func,
21708 +       .flags = IRQ_WORK_HARD_IRQ,
21709  };
21710  
21711  /*
21712 @@ -673,10 +679,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
21713  
21714         /* Read jiffies and the time when jiffies were updated last */
21715         do {
21716 -               seq = read_seqbegin(&jiffies_lock);
21717 +               seq = read_seqcount_begin(&jiffies_seq);
21718                 basemono = last_jiffies_update.tv64;
21719                 basejiff = jiffies;
21720 -       } while (read_seqretry(&jiffies_lock, seq));
21721 +       } while (read_seqcount_retry(&jiffies_seq, seq));
21722         ts->last_jiffies = basejiff;
21723  
21724         if (rcu_needs_cpu(basemono, &next_rcu) ||
21725 @@ -877,14 +883,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
21726                 return false;
21727  
21728         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
21729 -               static int ratelimit;
21730 -
21731 -               if (ratelimit < 10 &&
21732 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
21733 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
21734 -                               (unsigned int) local_softirq_pending());
21735 -                       ratelimit++;
21736 -               }
21737 +               softirq_check_pending_idle();
21738                 return false;
21739         }
21740  
21741 @@ -1193,6 +1192,7 @@ void tick_setup_sched_timer(void)
21742          * Emulate tick processing via per-CPU hrtimers:
21743          */
21744         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
21745 +       ts->sched_timer.irqsafe = 1;
21746         ts->sched_timer.function = tick_sched_timer;
21747  
21748         /* Get the next period (per-CPU) */
21749 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
21750 index d831827d7ab0..76d982c11ac3 100644
21751 --- a/kernel/time/timekeeping.c
21752 +++ b/kernel/time/timekeeping.c
21753 @@ -2348,8 +2348,10 @@ EXPORT_SYMBOL(hardpps);
21754   */
21755  void xtime_update(unsigned long ticks)
21756  {
21757 -       write_seqlock(&jiffies_lock);
21758 +       raw_spin_lock(&jiffies_lock);
21759 +       write_seqcount_begin(&jiffies_seq);
21760         do_timer(ticks);
21761 -       write_sequnlock(&jiffies_lock);
21762 +       write_seqcount_end(&jiffies_seq);
21763 +       raw_spin_unlock(&jiffies_lock);
21764         update_wall_time();
21765  }
21766 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
21767 index 704f595ce83f..763a3e5121ff 100644
21768 --- a/kernel/time/timekeeping.h
21769 +++ b/kernel/time/timekeeping.h
21770 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
21771  extern void do_timer(unsigned long ticks);
21772  extern void update_wall_time(void);
21773  
21774 -extern seqlock_t jiffies_lock;
21775 +extern raw_spinlock_t jiffies_lock;
21776 +extern seqcount_t jiffies_seq;
21777  
21778  #define CS_NAME_LEN    32
21779  
21780 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
21781 index 7d670362891a..31703677505f 100644
21782 --- a/kernel/time/timer.c
21783 +++ b/kernel/time/timer.c
21784 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
21785  #endif
21786  
21787  struct timer_base {
21788 -       spinlock_t              lock;
21789 +       raw_spinlock_t          lock;
21790         struct timer_list       *running_timer;
21791 +#ifdef CONFIG_PREEMPT_RT_FULL
21792 +       struct swait_queue_head wait_for_running_timer;
21793 +#endif
21794         unsigned long           clk;
21795         unsigned long           next_expiry;
21796         unsigned int            cpu;
21797 @@ -955,10 +958,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
21798  
21799                 if (!(tf & TIMER_MIGRATING)) {
21800                         base = get_timer_base(tf);
21801 -                       spin_lock_irqsave(&base->lock, *flags);
21802 +                       raw_spin_lock_irqsave(&base->lock, *flags);
21803                         if (timer->flags == tf)
21804                                 return base;
21805 -                       spin_unlock_irqrestore(&base->lock, *flags);
21806 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
21807                 }
21808                 cpu_relax();
21809         }
21810 @@ -1037,9 +1040,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
21811                         /* See the comment in lock_timer_base() */
21812                         timer->flags |= TIMER_MIGRATING;
21813  
21814 -                       spin_unlock(&base->lock);
21815 +                       raw_spin_unlock(&base->lock);
21816                         base = new_base;
21817 -                       spin_lock(&base->lock);
21818 +                       raw_spin_lock(&base->lock);
21819                         WRITE_ONCE(timer->flags,
21820                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
21821                         forward_timer_base(base);
21822 @@ -1062,7 +1065,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
21823         }
21824  
21825  out_unlock:
21826 -       spin_unlock_irqrestore(&base->lock, flags);
21827 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21828  
21829         return ret;
21830  }
21831 @@ -1156,9 +1159,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
21832         if (base != new_base) {
21833                 timer->flags |= TIMER_MIGRATING;
21834  
21835 -               spin_unlock(&base->lock);
21836 +               raw_spin_unlock(&base->lock);
21837                 base = new_base;
21838 -               spin_lock(&base->lock);
21839 +               raw_spin_lock(&base->lock);
21840                 WRITE_ONCE(timer->flags,
21841                            (timer->flags & ~TIMER_BASEMASK) | cpu);
21842         }
21843 @@ -1166,10 +1169,37 @@ void add_timer_on(struct timer_list *timer, int cpu)
21844  
21845         debug_activate(timer, timer->expires);
21846         internal_add_timer(base, timer);
21847 -       spin_unlock_irqrestore(&base->lock, flags);
21848 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21849  }
21850  EXPORT_SYMBOL_GPL(add_timer_on);
21851  
21852 +#ifdef CONFIG_PREEMPT_RT_FULL
21853 +/*
21854 + * Wait for a running timer
21855 + */
21856 +static void wait_for_running_timer(struct timer_list *timer)
21857 +{
21858 +       struct timer_base *base;
21859 +       u32 tf = timer->flags;
21860 +
21861 +       if (tf & TIMER_MIGRATING)
21862 +               return;
21863 +
21864 +       base = get_timer_base(tf);
21865 +       swait_event(base->wait_for_running_timer,
21866 +                  base->running_timer != timer);
21867 +}
21868 +
21869 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
21870 +#else
21871 +static inline void wait_for_running_timer(struct timer_list *timer)
21872 +{
21873 +       cpu_relax();
21874 +}
21875 +
21876 +# define wakeup_timer_waiters(b)       do { } while (0)
21877 +#endif
21878 +
21879  /**
21880   * del_timer - deactive a timer.
21881   * @timer: the timer to be deactivated
21882 @@ -1193,7 +1223,7 @@ int del_timer(struct timer_list *timer)
21883         if (timer_pending(timer)) {
21884                 base = lock_timer_base(timer, &flags);
21885                 ret = detach_if_pending(timer, base, true);
21886 -               spin_unlock_irqrestore(&base->lock, flags);
21887 +               raw_spin_unlock_irqrestore(&base->lock, flags);
21888         }
21889  
21890         return ret;
21891 @@ -1221,13 +1251,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
21892                 timer_stats_timer_clear_start_info(timer);
21893                 ret = detach_if_pending(timer, base, true);
21894         }
21895 -       spin_unlock_irqrestore(&base->lock, flags);
21896 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21897  
21898         return ret;
21899  }
21900  EXPORT_SYMBOL(try_to_del_timer_sync);
21901  
21902 -#ifdef CONFIG_SMP
21903 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
21904  /**
21905   * del_timer_sync - deactivate a timer and wait for the handler to finish.
21906   * @timer: the timer to be deactivated
21907 @@ -1287,7 +1317,7 @@ int del_timer_sync(struct timer_list *timer)
21908                 int ret = try_to_del_timer_sync(timer);
21909                 if (ret >= 0)
21910                         return ret;
21911 -               cpu_relax();
21912 +               wait_for_running_timer(timer);
21913         }
21914  }
21915  EXPORT_SYMBOL(del_timer_sync);
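
del_timer_sync() keeps its semantics under this change; on PREEMPT_RT_FULL it now sleeps on the base's wait_for_running_timer swait queue instead of spinning with cpu_relax() while the callback runs on another CPU. A hedged module-style sketch against the 4.9 timer API (the demo names and the 1 s period are made up):

#include <linux/module.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(unsigned long data)
{
        /* Timer callbacks run from the timer softirq (ktimersoftd on RT). */
        mod_timer(&demo_timer, jiffies + HZ);
}

static int __init demo_init(void)
{
        setup_timer(&demo_timer, demo_timer_fn, 0);
        mod_timer(&demo_timer, jiffies + HZ);
        return 0;
}

static void __exit demo_exit(void)
{
        /*
         * Identical call on RT and !RT; with this patch the RT case sleeps
         * on wait_for_running_timer until a concurrent callback finishes.
         */
        del_timer_sync(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
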
21916 @@ -1352,14 +1382,17 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
21917                 fn = timer->function;
21918                 data = timer->data;
21919  
21920 -               if (timer->flags & TIMER_IRQSAFE) {
21921 -                       spin_unlock(&base->lock);
21922 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
21923 +                   timer->flags & TIMER_IRQSAFE) {
21924 +                       raw_spin_unlock(&base->lock);
21925                         call_timer_fn(timer, fn, data);
21926 -                       spin_lock(&base->lock);
21927 +                       base->running_timer = NULL;
21928 +                       raw_spin_lock(&base->lock);
21929                 } else {
21930 -                       spin_unlock_irq(&base->lock);
21931 +                       raw_spin_unlock_irq(&base->lock);
21932                         call_timer_fn(timer, fn, data);
21933 -                       spin_lock_irq(&base->lock);
21934 +                       base->running_timer = NULL;
21935 +                       raw_spin_lock_irq(&base->lock);
21936                 }
21937         }
21938  }
21939 @@ -1528,7 +1561,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
21940         if (cpu_is_offline(smp_processor_id()))
21941                 return expires;
21942  
21943 -       spin_lock(&base->lock);
21944 +       raw_spin_lock(&base->lock);
21945         nextevt = __next_timer_interrupt(base);
21946         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
21947         base->next_expiry = nextevt;
21948 @@ -1562,7 +1595,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
21949                         base->is_idle = true;
21950                 }
21951         }
21952 -       spin_unlock(&base->lock);
21953 +       raw_spin_unlock(&base->lock);
21954  
21955         return cmp_next_hrtimer_event(basem, expires);
21956  }
21957 @@ -1627,13 +1660,13 @@ void update_process_times(int user_tick)
21958  
21959         /* Note: this timer irq context must be accounted for as well. */
21960         account_process_tick(p, user_tick);
21961 +       scheduler_tick();
21962         run_local_timers();
21963         rcu_check_callbacks(user_tick);
21964 -#ifdef CONFIG_IRQ_WORK
21965 +#if defined(CONFIG_IRQ_WORK)
21966         if (in_irq())
21967                 irq_work_tick();
21968  #endif
21969 -       scheduler_tick();
21970         run_posix_cpu_timers(p);
21971  }
21972  
21973 @@ -1649,7 +1682,7 @@ static inline void __run_timers(struct timer_base *base)
21974         if (!time_after_eq(jiffies, base->clk))
21975                 return;
21976  
21977 -       spin_lock_irq(&base->lock);
21978 +       raw_spin_lock_irq(&base->lock);
21979  
21980         while (time_after_eq(jiffies, base->clk)) {
21981  
21982 @@ -1659,8 +1692,8 @@ static inline void __run_timers(struct timer_base *base)
21983                 while (levels--)
21984                         expire_timers(base, heads + levels);
21985         }
21986 -       base->running_timer = NULL;
21987 -       spin_unlock_irq(&base->lock);
21988 +       raw_spin_unlock_irq(&base->lock);
21989 +       wakeup_timer_waiters(base);
21990  }
21991  
21992  /*
21993 @@ -1683,6 +1716,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
21994          */
21995         base->must_forward_clk = false;
21996  
21997 +       irq_work_tick_soft();
21998 +
21999         __run_timers(base);
22000         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
22001                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
22002 @@ -1868,16 +1903,16 @@ int timers_dead_cpu(unsigned int cpu)
22003                  * The caller is globally serialized and nobody else
22004                  * takes two locks at once, deadlock is not possible.
22005                  */
22006 -               spin_lock_irq(&new_base->lock);
22007 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
22008 +               raw_spin_lock_irq(&new_base->lock);
22009 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
22010  
22011                 BUG_ON(old_base->running_timer);
22012  
22013                 for (i = 0; i < WHEEL_SIZE; i++)
22014                         migrate_timer_list(new_base, old_base->vectors + i);
22015  
22016 -               spin_unlock(&old_base->lock);
22017 -               spin_unlock_irq(&new_base->lock);
22018 +               raw_spin_unlock(&old_base->lock);
22019 +               raw_spin_unlock_irq(&new_base->lock);
22020                 put_cpu_ptr(&timer_bases);
22021         }
22022         return 0;
22023 @@ -1893,8 +1928,11 @@ static void __init init_timer_cpu(int cpu)
22024         for (i = 0; i < NR_BASES; i++) {
22025                 base = per_cpu_ptr(&timer_bases[i], cpu);
22026                 base->cpu = cpu;
22027 -               spin_lock_init(&base->lock);
22028 +               raw_spin_lock_init(&base->lock);
22029                 base->clk = jiffies;
22030 +#ifdef CONFIG_PREEMPT_RT_FULL
22031 +               init_swait_queue_head(&base->wait_for_running_timer);
22032 +#endif
22033         }
22034  }
22035  
22036 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
22037 index 2a96b063d659..812e37237eb8 100644
22038 --- a/kernel/trace/Kconfig
22039 +++ b/kernel/trace/Kconfig
22040 @@ -182,6 +182,24 @@ config IRQSOFF_TRACER
22041           enabled. This option and the preempt-off timing option can be
22042           used together or separately.)
22043  
22044 +config INTERRUPT_OFF_HIST
22045 +       bool "Interrupts-off Latency Histogram"
22046 +       depends on IRQSOFF_TRACER
22047 +       help
22048 +         This option generates continuously updated histograms (one per cpu)
22049 +         of the duration of time periods with interrupts disabled. The
22050 +         histograms are disabled by default. To enable them, write a non-zero
22051 +         number to
22052 +
22053 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
22054 +
22055 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
22056 +         per cpu) are generated that accumulate the duration of time periods
22057 +         when both interrupts and preemption are disabled. The histogram data
22058 +         will be located in the debug file system at
22059 +
22060 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
22061 +
22062  config PREEMPT_TRACER
22063         bool "Preemption-off Latency Tracer"
22064         default n
22065 @@ -206,6 +224,24 @@ config PREEMPT_TRACER
22066           enabled. This option and the irqs-off timing option can be
22067           used together or separately.)
22068  
22069 +config PREEMPT_OFF_HIST
22070 +       bool "Preemption-off Latency Histogram"
22071 +       depends on PREEMPT_TRACER
22072 +       help
22073 +         This option generates continuously updated histograms (one per cpu)
22074 +         of the duration of time periods with preemption disabled. The
22075 +         histograms are disabled by default. To enable them, write a non-zero
22076 +         number to
22077 +
22078 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
22079 +
22080 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
22081 +         per cpu) are generated that accumulate the duration of time periods
22082 +         when both interrupts and preemption are disabled. The histogram data
22083 +         will be located in the debug file system at
22084 +
22085 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
22086 +
22087  config SCHED_TRACER
22088         bool "Scheduling Latency Tracer"
22089         select GENERIC_TRACER
22090 @@ -251,6 +287,74 @@ config HWLAT_TRACER
22091          file. Every time a latency is greater than tracing_thresh, it will
22092          be recorded into the ring buffer.
22093  
22094 +config WAKEUP_LATENCY_HIST
22095 +       bool "Scheduling Latency Histogram"
22096 +       depends on SCHED_TRACER
22097 +       help
22098 +         This option generates continuously updated histograms (one per cpu)
22099 +         of the scheduling latency of the highest priority task.
22100 +         The histograms are disabled by default. To enable them, write a
22101 +         non-zero number to
22102 +
22103 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
22104 +
22105 +         Two different algorithms are used, one to determine the latency of
22106 +         processes that exclusively use the highest priority of the system and
22107 +         another one to determine the latency of processes that share the
22108 +         highest system priority with other processes. The former is used to
22109 +         improve hardware and system software, the latter to optimize the
22110 +         priority design of a given system. The histogram data will be
22111 +         located in the debug file system at
22112 +
22113 +             /sys/kernel/debug/tracing/latency_hist/wakeup
22114 +
22115 +         and
22116 +
22117 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
22118 +
22119 +         If both Scheduling Latency Histogram and Missed Timer Offsets
22120 +         Histogram are selected, additional histogram data will be collected
22121 +         that contain, in addition to the wakeup latency, the timer latency, in
22122 +         case the wakeup was triggered by an expired timer. These histograms
22123 +         are available in the
22124 +
22125 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
22126 +
22127 +         directory. They reflect the apparent interrupt and scheduling latency
22128 +         and are best suited to determine the worst-case latency of a given
22129 +         system. To enable these histograms, write a non-zero number to
22130 +
22131 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
22132 +
22133 +config MISSED_TIMER_OFFSETS_HIST
22134 +       depends on HIGH_RES_TIMERS
22135 +       select GENERIC_TRACER
22136 +       bool "Missed Timer Offsets Histogram"
22137 +       help
22138 +         Generate a histogram of missed timer offsets in microseconds. The
22139 +         histograms are disabled by default. To enable them, write a non-zero
22140 +         number to
22141 +
22142 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
22143 +
22144 +         The histogram data will be located in the debug file system at
22145 +
22146 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
22147 +
22148 +         If both Scheduling Latency Histogram and Missed Timer Offsets
22149 +         Histogram are selected, additional histogram data will be collected
22150 +         that contain, in addition to the wakeup latency, the timer latency, in
22151 +         case the wakeup was triggered by an expired timer. These histograms
22152 +         are available in the
22153 +
22154 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
22155 +
22156 +         directory. They reflect the apparent interrupt and scheduling latency
22157 +         and are best suited to determine the worst-case latency of a given
22158 +         system. To enable these histograms, write a non-zero number to
22159 +
22160 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
22161 +
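
As the help texts above describe, each histogram is armed by writing a non-zero value to its file under latency_hist/enable and is read back as plain text. A small C sketch of that round trip follows; it assumes debugfs is mounted at /sys/kernel/debug, and the per-CPU file name "CPU0" is an assumption about the directory layout rather than something stated in the help text.

#include <stdio.h>

int main(void)
{
        const char *enable =
            "/sys/kernel/debug/tracing/latency_hist/enable/wakeup";
        const char *hist =
            "/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0"; /* assumed name */
        char line[256];
        FILE *f;

        f = fopen(enable, "w");
        if (!f) {
                perror(enable);
                return 1;
        }
        fputs("1\n", f);                /* any non-zero value enables logging */
        fclose(f);

        f = fopen(hist, "r");
        if (!f) {
                perror(hist);
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* dump the histogram as-is */
        fclose(f);
        return 0;
}
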
22162  config ENABLE_DEFAULT_TRACERS
22163         bool "Trace process context switches and events"
22164         depends on !GENERIC_TRACER
22165 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
22166 index e57980845549..83af000b783c 100644
22167 --- a/kernel/trace/Makefile
22168 +++ b/kernel/trace/Makefile
22169 @@ -38,6 +38,10 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
22170  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
22171  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
22172  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
22173 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
22174 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
22175 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
22176 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
22177  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
22178  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
22179  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
22180 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
22181 new file mode 100644
22182 index 000000000000..7f6ee70dea41
22183 --- /dev/null
22184 +++ b/kernel/trace/latency_hist.c
22185 @@ -0,0 +1,1178 @@
22186 +/*
22187 + * kernel/trace/latency_hist.c
22188 + *
22189 + * Add support for histograms of preemption-off latency and
22190 + * interrupt-off latency and wakeup latency; it depends on
22191 + * Real-Time Preemption Support.
22192 + *
22193 + *  Copyright (C) 2005 MontaVista Software, Inc.
22194 + *  Yi Yang <yyang@ch.mvista.com>
22195 + *
22196 + *  Converted to work with the new latency tracer.
22197 + *  Copyright (C) 2008 Red Hat, Inc.
22198 + *    Steven Rostedt <srostedt@redhat.com>
22199 + *
22200 + */
22201 +#include <linux/module.h>
22202 +#include <linux/debugfs.h>
22203 +#include <linux/seq_file.h>
22204 +#include <linux/percpu.h>
22205 +#include <linux/kallsyms.h>
22206 +#include <linux/uaccess.h>
22207 +#include <linux/sched.h>
22208 +#include <linux/sched/rt.h>
22209 +#include <linux/slab.h>
22210 +#include <linux/atomic.h>
22211 +#include <asm/div64.h>
22212 +
22213 +#include "trace.h"
22214 +#include <trace/events/sched.h>
22215 +
22216 +#define NSECS_PER_USECS 1000L
22217 +
22218 +#define CREATE_TRACE_POINTS
22219 +#include <trace/events/hist.h>
22220 +
22221 +enum {
22222 +       IRQSOFF_LATENCY = 0,
22223 +       PREEMPTOFF_LATENCY,
22224 +       PREEMPTIRQSOFF_LATENCY,
22225 +       WAKEUP_LATENCY,
22226 +       WAKEUP_LATENCY_SHAREDPRIO,
22227 +       MISSED_TIMER_OFFSETS,
22228 +       TIMERANDWAKEUP_LATENCY,
22229 +       MAX_LATENCY_TYPE,
22230 +};
22231 +
22232 +#define MAX_ENTRY_NUM 10240
22233 +
22234 +struct hist_data {
22235 +       atomic_t hist_mode; /* 0 log, 1 don't log */
22236 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
22237 +       long min_lat;
22238 +       long max_lat;
22239 +       unsigned long long below_hist_bound_samples;
22240 +       unsigned long long above_hist_bound_samples;
22241 +       long long accumulate_lat;
22242 +       unsigned long long total_samples;
22243 +       unsigned long long hist_array[MAX_ENTRY_NUM];
22244 +};
22245 +
22246 +struct enable_data {
22247 +       int latency_type;
22248 +       int enabled;
22249 +};
22250 +
22251 +static char *latency_hist_dir_root = "latency_hist";
22252 +
22253 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22254 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
22255 +static char *irqsoff_hist_dir = "irqsoff";
22256 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
22257 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
22258 +#endif
22259 +
22260 +#ifdef CONFIG_PREEMPT_OFF_HIST
22261 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
22262 +static char *preemptoff_hist_dir = "preemptoff";
22263 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
22264 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
22265 +#endif
22266 +
22267 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
22268 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
22269 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
22270 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
22271 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
22272 +#endif
22273 +
22274 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
22275 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
22276 +static struct enable_data preemptirqsoff_enabled_data = {
22277 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
22278 +       .enabled = 0,
22279 +};
22280 +#endif
22281 +
22282 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22283 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22284 +struct maxlatproc_data {
22285 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
22286 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
22287 +       int pid;
22288 +       int current_pid;
22289 +       int prio;
22290 +       int current_prio;
22291 +       long latency;
22292 +       long timeroffset;
22293 +       cycle_t timestamp;
22294 +};
22295 +#endif
22296 +
22297 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22298 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
22299 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
22300 +static char *wakeup_latency_hist_dir = "wakeup";
22301 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
22302 +static notrace void probe_wakeup_latency_hist_start(void *v,
22303 +       struct task_struct *p);
22304 +static notrace void probe_wakeup_latency_hist_stop(void *v,
22305 +       bool preempt, struct task_struct *prev, struct task_struct *next);
22306 +static notrace void probe_sched_migrate_task(void *,
22307 +       struct task_struct *task, int cpu);
22308 +static struct enable_data wakeup_latency_enabled_data = {
22309 +       .latency_type = WAKEUP_LATENCY,
22310 +       .enabled = 0,
22311 +};
22312 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
22313 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
22314 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
22315 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
22316 +static unsigned long wakeup_pid;
22317 +#endif
22318 +
22319 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22320 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
22321 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
22322 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
22323 +       long long offset, struct task_struct *curr, struct task_struct *task);
22324 +static struct enable_data missed_timer_offsets_enabled_data = {
22325 +       .latency_type = MISSED_TIMER_OFFSETS,
22326 +       .enabled = 0,
22327 +};
22328 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
22329 +static unsigned long missed_timer_offsets_pid;
22330 +#endif
22331 +
22332 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22333 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22334 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
22335 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
22336 +static struct enable_data timerandwakeup_enabled_data = {
22337 +       .latency_type = TIMERANDWAKEUP_LATENCY,
22338 +       .enabled = 0,
22339 +};
22340 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
22341 +#endif
22342 +
22343 +void notrace latency_hist(int latency_type, int cpu, long latency,
22344 +                         long timeroffset, cycle_t stop,
22345 +                         struct task_struct *p)
22346 +{
22347 +       struct hist_data *my_hist;
22348 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22349 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22350 +       struct maxlatproc_data *mp = NULL;
22351 +#endif
22352 +
22353 +       if (!cpu_possible(cpu) || latency_type < 0 ||
22354 +           latency_type >= MAX_LATENCY_TYPE)
22355 +               return;
22356 +
22357 +       switch (latency_type) {
22358 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22359 +       case IRQSOFF_LATENCY:
22360 +               my_hist = &per_cpu(irqsoff_hist, cpu);
22361 +               break;
22362 +#endif
22363 +#ifdef CONFIG_PREEMPT_OFF_HIST
22364 +       case PREEMPTOFF_LATENCY:
22365 +               my_hist = &per_cpu(preemptoff_hist, cpu);
22366 +               break;
22367 +#endif
22368 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
22369 +       case PREEMPTIRQSOFF_LATENCY:
22370 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
22371 +               break;
22372 +#endif
22373 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22374 +       case WAKEUP_LATENCY:
22375 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
22376 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
22377 +               break;
22378 +       case WAKEUP_LATENCY_SHAREDPRIO:
22379 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
22380 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
22381 +               break;
22382 +#endif
22383 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22384 +       case MISSED_TIMER_OFFSETS:
22385 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
22386 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
22387 +               break;
22388 +#endif
22389 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22390 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22391 +       case TIMERANDWAKEUP_LATENCY:
22392 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
22393 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
22394 +               break;
22395 +#endif
22396 +
22397 +       default:
22398 +               return;
22399 +       }
22400 +
22401 +       latency += my_hist->offset;
22402 +
22403 +       if (atomic_read(&my_hist->hist_mode) == 0)
22404 +               return;
22405 +
22406 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
22407 +               if (latency < 0)
22408 +                       my_hist->below_hist_bound_samples++;
22409 +               else
22410 +                       my_hist->above_hist_bound_samples++;
22411 +       } else
22412 +               my_hist->hist_array[latency]++;
22413 +
22414 +       if (unlikely(latency > my_hist->max_lat ||
22415 +           my_hist->min_lat == LONG_MAX)) {
22416 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22417 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22418 +               if (latency_type == WAKEUP_LATENCY ||
22419 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
22420 +                   latency_type == MISSED_TIMER_OFFSETS ||
22421 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
22422 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
22423 +                       strncpy(mp->current_comm, current->comm,
22424 +                           sizeof(mp->current_comm));
22425 +                       mp->pid = task_pid_nr(p);
22426 +                       mp->current_pid = task_pid_nr(current);
22427 +                       mp->prio = p->prio;
22428 +                       mp->current_prio = current->prio;
22429 +                       mp->latency = latency;
22430 +                       mp->timeroffset = timeroffset;
22431 +                       mp->timestamp = stop;
22432 +               }
22433 +#endif
22434 +               my_hist->max_lat = latency;
22435 +       }
22436 +       if (unlikely(latency < my_hist->min_lat))
22437 +               my_hist->min_lat = latency;
22438 +       my_hist->total_samples++;
22439 +       my_hist->accumulate_lat += latency;
22440 +}
22441 +
22442 +static void *l_start(struct seq_file *m, loff_t *pos)
22443 +{
22444 +       loff_t *index_ptr = NULL;
22445 +       loff_t index = *pos;
22446 +       struct hist_data *my_hist = m->private;
22447 +
22448 +       if (index == 0) {
22449 +               char minstr[32], avgstr[32], maxstr[32];
22450 +
22451 +               atomic_dec(&my_hist->hist_mode);
22452 +
22453 +               if (likely(my_hist->total_samples)) {
22454 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
22455 +                           my_hist->total_samples);
22456 +                       snprintf(minstr, sizeof(minstr), "%ld",
22457 +                           my_hist->min_lat - my_hist->offset);
22458 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
22459 +                           avg - my_hist->offset);
22460 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
22461 +                           my_hist->max_lat - my_hist->offset);
22462 +               } else {
22463 +                       strcpy(minstr, "<undef>");
22464 +                       strcpy(avgstr, minstr);
22465 +                       strcpy(maxstr, minstr);
22466 +               }
22467 +
22468 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
22469 +                          "#Average latency: %s microseconds\n"
22470 +                          "#Maximum latency: %s microseconds\n"
22471 +                          "#Total samples: %llu\n"
22472 +                          "#There are %llu samples lower than %ld"
22473 +                          " microseconds.\n"
22474 +                          "#There are %llu samples greater or equal"
22475 +                          " than %ld microseconds.\n"
22476 +                          "#usecs\t%16s\n",
22477 +                          minstr, avgstr, maxstr,
22478 +                          my_hist->total_samples,
22479 +                          my_hist->below_hist_bound_samples,
22480 +                          -my_hist->offset,
22481 +                          my_hist->above_hist_bound_samples,
22482 +                          MAX_ENTRY_NUM - my_hist->offset,
22483 +                          "samples");
22484 +       }
22485 +       if (index < MAX_ENTRY_NUM) {
22486 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
22487 +               if (index_ptr)
22488 +                       *index_ptr = index;
22489 +       }
22490 +
22491 +       return index_ptr;
22492 +}
22493 +
22494 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
22495 +{
22496 +       loff_t *index_ptr = p;
22497 +       struct hist_data *my_hist = m->private;
22498 +
22499 +       if (++*pos >= MAX_ENTRY_NUM) {
22500 +               atomic_inc(&my_hist->hist_mode);
22501 +               return NULL;
22502 +       }
22503 +       *index_ptr = *pos;
22504 +       return index_ptr;
22505 +}
22506 +
22507 +static void l_stop(struct seq_file *m, void *p)
22508 +{
22509 +       kfree(p);
22510 +}
22511 +
22512 +static int l_show(struct seq_file *m, void *p)
22513 +{
22514 +       int index = *(loff_t *) p;
22515 +       struct hist_data *my_hist = m->private;
22516 +
22517 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
22518 +           my_hist->hist_array[index]);
22519 +       return 0;
22520 +}
22521 +
22522 +static const struct seq_operations latency_hist_seq_op = {
22523 +       .start = l_start,
22524 +       .next  = l_next,
22525 +       .stop  = l_stop,
22526 +       .show  = l_show
22527 +};
22528 +
22529 +static int latency_hist_open(struct inode *inode, struct file *file)
22530 +{
22531 +       int ret;
22532 +
22533 +       ret = seq_open(file, &latency_hist_seq_op);
22534 +       if (!ret) {
22535 +               struct seq_file *seq = file->private_data;
22536 +               seq->private = inode->i_private;
22537 +       }
22538 +       return ret;
22539 +}
22540 +
22541 +static const struct file_operations latency_hist_fops = {
22542 +       .open = latency_hist_open,
22543 +       .read = seq_read,
22544 +       .llseek = seq_lseek,
22545 +       .release = seq_release,
22546 +};
22547 +
22548 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22549 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22550 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
22551 +{
22552 +       mp->comm[0] = mp->current_comm[0] = '\0';
22553 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
22554 +           mp->latency = mp->timeroffset = -1;
22555 +       mp->timestamp = 0;
22556 +}
22557 +#endif
22558 +
22559 +static void hist_reset(struct hist_data *hist)
22560 +{
22561 +       atomic_dec(&hist->hist_mode);
22562 +
22563 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
22564 +       hist->below_hist_bound_samples = 0ULL;
22565 +       hist->above_hist_bound_samples = 0ULL;
22566 +       hist->min_lat = LONG_MAX;
22567 +       hist->max_lat = LONG_MIN;
22568 +       hist->total_samples = 0ULL;
22569 +       hist->accumulate_lat = 0LL;
22570 +
22571 +       atomic_inc(&hist->hist_mode);
22572 +}
22573 +
22574 +static ssize_t
22575 +latency_hist_reset(struct file *file, const char __user *a,
22576 +                  size_t size, loff_t *off)
22577 +{
22578 +       int cpu;
22579 +       struct hist_data *hist = NULL;
22580 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22581 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22582 +       struct maxlatproc_data *mp = NULL;
22583 +#endif
22584 +       off_t latency_type = (off_t) file->private_data;
22585 +
22586 +       for_each_online_cpu(cpu) {
22587 +
22588 +               switch (latency_type) {
22589 +#ifdef CONFIG_PREEMPT_OFF_HIST
22590 +               case PREEMPTOFF_LATENCY:
22591 +                       hist = &per_cpu(preemptoff_hist, cpu);
22592 +                       break;
22593 +#endif
22594 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22595 +               case IRQSOFF_LATENCY:
22596 +                       hist = &per_cpu(irqsoff_hist, cpu);
22597 +                       break;
22598 +#endif
22599 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22600 +               case PREEMPTIRQSOFF_LATENCY:
22601 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
22602 +                       break;
22603 +#endif
22604 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22605 +               case WAKEUP_LATENCY:
22606 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
22607 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
22608 +                       break;
22609 +               case WAKEUP_LATENCY_SHAREDPRIO:
22610 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
22611 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
22612 +                       break;
22613 +#endif
22614 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22615 +               case MISSED_TIMER_OFFSETS:
22616 +                       hist = &per_cpu(missed_timer_offsets, cpu);
22617 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
22618 +                       break;
22619 +#endif
22620 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22621 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22622 +               case TIMERANDWAKEUP_LATENCY:
22623 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
22624 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
22625 +                       break;
22626 +#endif
22627 +               }
22628 +
22629 +               hist_reset(hist);
22630 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22631 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22632 +               if (latency_type == WAKEUP_LATENCY ||
22633 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
22634 +                   latency_type == MISSED_TIMER_OFFSETS ||
22635 +                   latency_type == TIMERANDWAKEUP_LATENCY)
22636 +                       clear_maxlatprocdata(mp);
22637 +#endif
22638 +       }
22639 +
22640 +       return size;
22641 +}
22642 +
22643 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22644 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22645 +static ssize_t
22646 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22647 +{
22648 +       char buf[64];
22649 +       int r;
22650 +       unsigned long *this_pid = file->private_data;
22651 +
22652 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
22653 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22654 +}
22655 +
22656 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
22657 +                     size_t cnt, loff_t *ppos)
22658 +{
22659 +       char buf[64];
22660 +       unsigned long pid;
22661 +       unsigned long *this_pid = file->private_data;
22662 +
22663 +       if (cnt >= sizeof(buf))
22664 +               return -EINVAL;
22665 +
22666 +       if (copy_from_user(&buf, ubuf, cnt))
22667 +               return -EFAULT;
22668 +
22669 +       buf[cnt] = '\0';
22670 +
22671 +       if (kstrtoul(buf, 10, &pid))
22672 +               return -EINVAL;
22673 +
22674 +       *this_pid = pid;
22675 +
22676 +       return cnt;
22677 +}
22678 +#endif
22679 +
22680 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22681 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22682 +static ssize_t
22683 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22684 +{
22685 +       int r;
22686 +       struct maxlatproc_data *mp = file->private_data;
22687 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
22688 +       unsigned long long t;
22689 +       unsigned long usecs, secs;
22690 +       char *buf;
22691 +
22692 +       if (mp->pid == -1 || mp->current_pid == -1) {
22693 +               buf = "(none)\n";
22694 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
22695 +                   strlen(buf));
22696 +       }
22697 +
22698 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
22699 +       if (buf == NULL)
22700 +               return -ENOMEM;
22701 +
22702 +       t = ns2usecs(mp->timestamp);
22703 +       usecs = do_div(t, USEC_PER_SEC);
22704 +       secs = (unsigned long) t;
22705 +       r = snprintf(buf, strmaxlen,
22706 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
22707 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
22708 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
22709 +           secs, usecs);
22710 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22711 +       kfree(buf);
22712 +       return r;
22713 +}
22714 +#endif
22715 +
22716 +static ssize_t
22717 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22718 +{
22719 +       char buf[64];
22720 +       struct enable_data *ed = file->private_data;
22721 +       int r;
22722 +
22723 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
22724 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22725 +}
22726 +
22727 +static ssize_t
22728 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
22729 +{
22730 +       char buf[64];
22731 +       long enable;
22732 +       struct enable_data *ed = file->private_data;
22733 +
22734 +       if (cnt >= sizeof(buf))
22735 +               return -EINVAL;
22736 +
22737 +       if (copy_from_user(&buf, ubuf, cnt))
22738 +               return -EFAULT;
22739 +
22740 +       buf[cnt] = 0;
22741 +
22742 +       if (kstrtoul(buf, 10, &enable))
22743 +               return -EINVAL;
22744 +
22745 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
22746 +               return cnt;
22747 +
22748 +       if (enable) {
22749 +               int ret;
22750 +
22751 +               switch (ed->latency_type) {
22752 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22753 +               case PREEMPTIRQSOFF_LATENCY:
22754 +                       ret = register_trace_preemptirqsoff_hist(
22755 +                           probe_preemptirqsoff_hist, NULL);
22756 +                       if (ret) {
22757 +                               pr_info("wakeup trace: Couldn't assign "
22758 +                                   "probe_preemptirqsoff_hist "
22759 +                                   "to trace_preemptirqsoff_hist\n");
22760 +                               return ret;
22761 +                       }
22762 +                       break;
22763 +#endif
22764 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22765 +               case WAKEUP_LATENCY:
22766 +                       ret = register_trace_sched_wakeup(
22767 +                           probe_wakeup_latency_hist_start, NULL);
22768 +                       if (ret) {
22769 +                               pr_info("wakeup trace: Couldn't assign "
22770 +                                   "probe_wakeup_latency_hist_start "
22771 +                                   "to trace_sched_wakeup\n");
22772 +                               return ret;
22773 +                       }
22774 +                       ret = register_trace_sched_wakeup_new(
22775 +                           probe_wakeup_latency_hist_start, NULL);
22776 +                       if (ret) {
22777 +                               pr_info("wakeup trace: Couldn't assign "
22778 +                                   "probe_wakeup_latency_hist_start "
22779 +                                   "to trace_sched_wakeup_new\n");
22780 +                               unregister_trace_sched_wakeup(
22781 +                                   probe_wakeup_latency_hist_start, NULL);
22782 +                               return ret;
22783 +                       }
22784 +                       ret = register_trace_sched_switch(
22785 +                           probe_wakeup_latency_hist_stop, NULL);
22786 +                       if (ret) {
22787 +                               pr_info("wakeup trace: Couldn't assign "
22788 +                                   "probe_wakeup_latency_hist_stop "
22789 +                                   "to trace_sched_switch\n");
22790 +                               unregister_trace_sched_wakeup(
22791 +                                   probe_wakeup_latency_hist_start, NULL);
22792 +                               unregister_trace_sched_wakeup_new(
22793 +                                   probe_wakeup_latency_hist_start, NULL);
22794 +                               return ret;
22795 +                       }
22796 +                       ret = register_trace_sched_migrate_task(
22797 +                           probe_sched_migrate_task, NULL);
22798 +                       if (ret) {
22799 +                               pr_info("wakeup trace: Couldn't assign "
22800 +                                   "probe_sched_migrate_task "
22801 +                                   "to trace_sched_migrate_task\n");
22802 +                               unregister_trace_sched_wakeup(
22803 +                                   probe_wakeup_latency_hist_start, NULL);
22804 +                               unregister_trace_sched_wakeup_new(
22805 +                                   probe_wakeup_latency_hist_start, NULL);
22806 +                               unregister_trace_sched_switch(
22807 +                                   probe_wakeup_latency_hist_stop, NULL);
22808 +                               return ret;
22809 +                       }
22810 +                       break;
22811 +#endif
22812 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22813 +               case MISSED_TIMER_OFFSETS:
22814 +                       ret = register_trace_hrtimer_interrupt(
22815 +                           probe_hrtimer_interrupt, NULL);
22816 +                       if (ret) {
22817 +                               pr_info("wakeup trace: Couldn't assign "
22818 +                                   "probe_hrtimer_interrupt "
22819 +                                   "to trace_hrtimer_interrupt\n");
22820 +                               return ret;
22821 +                       }
22822 +                       break;
22823 +#endif
22824 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22825 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22826 +               case TIMERANDWAKEUP_LATENCY:
22827 +                       if (!wakeup_latency_enabled_data.enabled ||
22828 +                           !missed_timer_offsets_enabled_data.enabled)
22829 +                               return -EINVAL;
22830 +                       break;
22831 +#endif
22832 +               default:
22833 +                       break;
22834 +               }
22835 +       } else {
22836 +               switch (ed->latency_type) {
22837 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22838 +               case PREEMPTIRQSOFF_LATENCY:
22839 +                       {
22840 +                               int cpu;
22841 +
22842 +                               unregister_trace_preemptirqsoff_hist(
22843 +                                   probe_preemptirqsoff_hist, NULL);
22844 +                               for_each_online_cpu(cpu) {
22845 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22846 +                                       per_cpu(hist_irqsoff_counting,
22847 +                                           cpu) = 0;
22848 +#endif
22849 +#ifdef CONFIG_PREEMPT_OFF_HIST
22850 +                                       per_cpu(hist_preemptoff_counting,
22851 +                                           cpu) = 0;
22852 +#endif
22853 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22854 +                                       per_cpu(hist_preemptirqsoff_counting,
22855 +                                           cpu) = 0;
22856 +#endif
22857 +                               }
22858 +                       }
22859 +                       break;
22860 +#endif
22861 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22862 +               case WAKEUP_LATENCY:
22863 +                       {
22864 +                               int cpu;
22865 +
22866 +                               unregister_trace_sched_wakeup(
22867 +                                   probe_wakeup_latency_hist_start, NULL);
22868 +                               unregister_trace_sched_wakeup_new(
22869 +                                   probe_wakeup_latency_hist_start, NULL);
22870 +                               unregister_trace_sched_switch(
22871 +                                   probe_wakeup_latency_hist_stop, NULL);
22872 +                               unregister_trace_sched_migrate_task(
22873 +                                   probe_sched_migrate_task, NULL);
22874 +
22875 +                               for_each_online_cpu(cpu) {
22876 +                                       per_cpu(wakeup_task, cpu) = NULL;
22877 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
22878 +                               }
22879 +                       }
22880 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22881 +                       timerandwakeup_enabled_data.enabled = 0;
22882 +#endif
22883 +                       break;
22884 +#endif
22885 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22886 +               case MISSED_TIMER_OFFSETS:
22887 +                       unregister_trace_hrtimer_interrupt(
22888 +                           probe_hrtimer_interrupt, NULL);
22889 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22890 +                       timerandwakeup_enabled_data.enabled = 0;
22891 +#endif
22892 +                       break;
22893 +#endif
22894 +               default:
22895 +                       break;
22896 +               }
22897 +       }
22898 +       ed->enabled = enable;
22899 +       return cnt;
22900 +}
22901 +
22902 +static const struct file_operations latency_hist_reset_fops = {
22903 +       .open = tracing_open_generic,
22904 +       .write = latency_hist_reset,
22905 +};
22906 +
22907 +static const struct file_operations enable_fops = {
22908 +       .open = tracing_open_generic,
22909 +       .read = show_enable,
22910 +       .write = do_enable,
22911 +};
22912 +
22913 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22914 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22915 +static const struct file_operations pid_fops = {
22916 +       .open = tracing_open_generic,
22917 +       .read = show_pid,
22918 +       .write = do_pid,
22919 +};
22920 +
22921 +static const struct file_operations maxlatproc_fops = {
22922 +       .open = tracing_open_generic,
22923 +       .read = show_maxlatproc,
22924 +};
22925 +#endif
22926 +
22927 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22928 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
22929 +       int starthist)
22930 +{
22931 +       int cpu = raw_smp_processor_id();
22932 +       int time_set = 0;
22933 +
22934 +       if (starthist) {
22935 +               cycle_t uninitialized_var(start);
22936 +
22937 +               if (!preempt_count() && !irqs_disabled())
22938 +                       return;
22939 +
22940 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22941 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
22942 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
22943 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
22944 +                       start = ftrace_now(cpu);
22945 +                       time_set++;
22946 +                       per_cpu(hist_irqsoff_start, cpu) = start;
22947 +               }
22948 +#endif
22949 +
22950 +#ifdef CONFIG_PREEMPT_OFF_HIST
22951 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
22952 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
22953 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
22954 +                       if (!(time_set++))
22955 +                               start = ftrace_now(cpu);
22956 +                       per_cpu(hist_preemptoff_start, cpu) = start;
22957 +               }
22958 +#endif
22959 +
22960 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22961 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
22962 +                   per_cpu(hist_preemptoff_counting, cpu) &&
22963 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
22964 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
22965 +                       if (!time_set)
22966 +                               start = ftrace_now(cpu);
22967 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
22968 +               }
22969 +#endif
22970 +       } else {
22971 +               cycle_t uninitialized_var(stop);
22972 +
22973 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22974 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
22975 +                   per_cpu(hist_irqsoff_counting, cpu)) {
22976 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
22977 +
22978 +                       stop = ftrace_now(cpu);
22979 +                       time_set++;
22980 +                       if (start) {
22981 +                               long latency = ((long) (stop - start)) /
22982 +                                   NSECS_PER_USECS;
22983 +
22984 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
22985 +                                   stop, NULL);
22986 +                       }
22987 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
22988 +               }
22989 +#endif
22990 +
22991 +#ifdef CONFIG_PREEMPT_OFF_HIST
22992 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
22993 +                   per_cpu(hist_preemptoff_counting, cpu)) {
22994 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
22995 +
22996 +                       if (!(time_set++))
22997 +                               stop = ftrace_now(cpu);
22998 +                       if (start) {
22999 +                               long latency = ((long) (stop - start)) /
23000 +                                   NSECS_PER_USECS;
23001 +
23002 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
23003 +                                   0, stop, NULL);
23004 +                       }
23005 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
23006 +               }
23007 +#endif
23008 +
23009 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
23010 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
23011 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
23012 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
23013 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
23014 +
23015 +                       if (!time_set)
23016 +                               stop = ftrace_now(cpu);
23017 +                       if (start) {
23018 +                               long latency = ((long) (stop - start)) /
23019 +                                   NSECS_PER_USECS;
23020 +
23021 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
23022 +                                   latency, 0, stop, NULL);
23023 +                       }
23024 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
23025 +               }
23026 +#endif
23027 +       }
23028 +}
23029 +#endif
23030 +
23031 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23032 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
23033 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
23034 +       int cpu)
23035 +{
23036 +       int old_cpu = task_cpu(task);
23037 +
23038 +       if (cpu != old_cpu) {
23039 +               unsigned long flags;
23040 +               struct task_struct *cpu_wakeup_task;
23041 +
23042 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
23043 +
23044 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
23045 +               if (task == cpu_wakeup_task) {
23046 +                       put_task_struct(cpu_wakeup_task);
23047 +                       per_cpu(wakeup_task, old_cpu) = NULL;
23048 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
23049 +                       get_task_struct(cpu_wakeup_task);
23050 +               }
23051 +
23052 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
23053 +       }
23054 +}
23055 +
23056 +static notrace void probe_wakeup_latency_hist_start(void *v,
23057 +       struct task_struct *p)
23058 +{
23059 +       unsigned long flags;
23060 +       struct task_struct *curr = current;
23061 +       int cpu = task_cpu(p);
23062 +       struct task_struct *cpu_wakeup_task;
23063 +
23064 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
23065 +
23066 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
23067 +
23068 +       if (wakeup_pid) {
23069 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
23070 +                   p->prio == curr->prio)
23071 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
23072 +               if (likely(wakeup_pid != task_pid_nr(p)))
23073 +                       goto out;
23074 +       } else {
23075 +               if (likely(!rt_task(p)) ||
23076 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
23077 +                   p->prio > curr->prio)
23078 +                       goto out;
23079 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
23080 +                   p->prio == curr->prio)
23081 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
23082 +       }
23083 +
23084 +       if (cpu_wakeup_task)
23085 +               put_task_struct(cpu_wakeup_task);
23086 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
23087 +       get_task_struct(cpu_wakeup_task);
23088 +       cpu_wakeup_task->preempt_timestamp_hist =
23089 +               ftrace_now(raw_smp_processor_id());
23090 +out:
23091 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
23092 +}
23093 +
23094 +static notrace void probe_wakeup_latency_hist_stop(void *v,
23095 +       bool preempt, struct task_struct *prev, struct task_struct *next)
23096 +{
23097 +       unsigned long flags;
23098 +       int cpu = task_cpu(next);
23099 +       long latency;
23100 +       cycle_t stop;
23101 +       struct task_struct *cpu_wakeup_task;
23102 +
23103 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
23104 +
23105 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
23106 +
23107 +       if (cpu_wakeup_task == NULL)
23108 +               goto out;
23109 +
23110 +       /* Already running? */
23111 +       if (unlikely(current == cpu_wakeup_task))
23112 +               goto out_reset;
23113 +
23114 +       if (next != cpu_wakeup_task) {
23115 +               if (next->prio < cpu_wakeup_task->prio)
23116 +                       goto out_reset;
23117 +
23118 +               if (next->prio == cpu_wakeup_task->prio)
23119 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
23120 +
23121 +               goto out;
23122 +       }
23123 +
23124 +       if (current->prio == cpu_wakeup_task->prio)
23125 +               per_cpu(wakeup_sharedprio, cpu) = 1;
23126 +
23127 +       /*
23128 +        * The task we are waiting for is about to be switched to.
23129 +        * Calculate latency and store it in histogram.
23130 +        */
23131 +       stop = ftrace_now(raw_smp_processor_id());
23132 +
23133 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
23134 +           NSECS_PER_USECS;
23135 +
23136 +       if (per_cpu(wakeup_sharedprio, cpu)) {
23137 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
23138 +                   next);
23139 +               per_cpu(wakeup_sharedprio, cpu) = 0;
23140 +       } else {
23141 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
23142 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23143 +               if (timerandwakeup_enabled_data.enabled) {
23144 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
23145 +                           next->timer_offset + latency, next->timer_offset,
23146 +                           stop, next);
23147 +               }
23148 +#endif
23149 +       }
23150 +
23151 +out_reset:
23152 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23153 +       next->timer_offset = 0;
23154 +#endif
23155 +       put_task_struct(cpu_wakeup_task);
23156 +       per_cpu(wakeup_task, cpu) = NULL;
23157 +out:
23158 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
23159 +}
23160 +#endif
23161 +
23162 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23163 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
23164 +       long long latency_ns, struct task_struct *curr,
23165 +       struct task_struct *task)
23166 +{
23167 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
23168 +           (task->prio < curr->prio ||
23169 +           (task->prio == curr->prio &&
23170 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
23171 +               long latency;
23172 +               cycle_t now;
23173 +
23174 +               if (missed_timer_offsets_pid) {
23175 +                       if (likely(missed_timer_offsets_pid !=
23176 +                           task_pid_nr(task)))
23177 +                               return;
23178 +               }
23179 +
23180 +               now = ftrace_now(cpu);
23181 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
23182 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
23183 +                   task);
23184 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23185 +               task->timer_offset = latency;
23186 +#endif
23187 +       }
23188 +}
23189 +#endif
23190 +
23191 +static __init int latency_hist_init(void)
23192 +{
23193 +       struct dentry *latency_hist_root = NULL;
23194 +       struct dentry *dentry;
23195 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23196 +       struct dentry *dentry_sharedprio;
23197 +#endif
23198 +       struct dentry *entry;
23199 +       struct dentry *enable_root;
23200 +       int i = 0;
23201 +       struct hist_data *my_hist;
23202 +       char name[64];
23203 +       char *cpufmt = "CPU%d";
23204 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
23205 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
23206 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
23207 +       struct maxlatproc_data *mp = NULL;
23208 +#endif
23209 +
23210 +       dentry = tracing_init_dentry();
23211 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
23212 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
23213 +
23214 +#ifdef CONFIG_INTERRUPT_OFF_HIST
23215 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
23216 +       for_each_possible_cpu(i) {
23217 +               sprintf(name, cpufmt, i);
23218 +               entry = debugfs_create_file(name, 0444, dentry,
23219 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
23220 +               my_hist = &per_cpu(irqsoff_hist, i);
23221 +               atomic_set(&my_hist->hist_mode, 1);
23222 +               my_hist->min_lat = LONG_MAX;
23223 +       }
23224 +       entry = debugfs_create_file("reset", 0644, dentry,
23225 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
23226 +#endif
23227 +
23228 +#ifdef CONFIG_PREEMPT_OFF_HIST
23229 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
23230 +           latency_hist_root);
23231 +       for_each_possible_cpu(i) {
23232 +               sprintf(name, cpufmt, i);
23233 +               entry = debugfs_create_file(name, 0444, dentry,
23234 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
23235 +               my_hist = &per_cpu(preemptoff_hist, i);
23236 +               atomic_set(&my_hist->hist_mode, 1);
23237 +               my_hist->min_lat = LONG_MAX;
23238 +       }
23239 +       entry = debugfs_create_file("reset", 0644, dentry,
23240 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
23241 +#endif
23242 +
23243 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
23244 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
23245 +           latency_hist_root);
23246 +       for_each_possible_cpu(i) {
23247 +               sprintf(name, cpufmt, i);
23248 +               entry = debugfs_create_file(name, 0444, dentry,
23249 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
23250 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
23251 +               atomic_set(&my_hist->hist_mode, 1);
23252 +               my_hist->min_lat = LONG_MAX;
23253 +       }
23254 +       entry = debugfs_create_file("reset", 0644, dentry,
23255 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
23256 +#endif
23257 +
23258 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
23259 +       entry = debugfs_create_file("preemptirqsoff", 0644,
23260 +           enable_root, (void *)&preemptirqsoff_enabled_data,
23261 +           &enable_fops);
23262 +#endif
23263 +
23264 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
23265 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
23266 +           latency_hist_root);
23267 +       dentry_sharedprio = debugfs_create_dir(
23268 +           wakeup_latency_hist_dir_sharedprio, dentry);
23269 +       for_each_possible_cpu(i) {
23270 +               sprintf(name, cpufmt, i);
23271 +
23272 +               entry = debugfs_create_file(name, 0444, dentry,
23273 +                   &per_cpu(wakeup_latency_hist, i),
23274 +                   &latency_hist_fops);
23275 +               my_hist = &per_cpu(wakeup_latency_hist, i);
23276 +               atomic_set(&my_hist->hist_mode, 1);
23277 +               my_hist->min_lat = LONG_MAX;
23278 +
23279 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
23280 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
23281 +                   &latency_hist_fops);
23282 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
23283 +               atomic_set(&my_hist->hist_mode, 1);
23284 +               my_hist->min_lat = LONG_MAX;
23285 +
23286 +               sprintf(name, cpufmt_maxlatproc, i);
23287 +
23288 +               mp = &per_cpu(wakeup_maxlatproc, i);
23289 +               entry = debugfs_create_file(name, 0444, dentry, mp,
23290 +                   &maxlatproc_fops);
23291 +               clear_maxlatprocdata(mp);
23292 +
23293 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
23294 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
23295 +                   &maxlatproc_fops);
23296 +               clear_maxlatprocdata(mp);
23297 +       }
23298 +       entry = debugfs_create_file("pid", 0644, dentry,
23299 +           (void *)&wakeup_pid, &pid_fops);
23300 +       entry = debugfs_create_file("reset", 0644, dentry,
23301 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
23302 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
23303 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
23304 +       entry = debugfs_create_file("wakeup", 0644,
23305 +           enable_root, (void *)&wakeup_latency_enabled_data,
23306 +           &enable_fops);
23307 +#endif
23308 +
23309 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
23310 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
23311 +           latency_hist_root);
23312 +       for_each_possible_cpu(i) {
23313 +               sprintf(name, cpufmt, i);
23314 +               entry = debugfs_create_file(name, 0444, dentry,
23315 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
23316 +               my_hist = &per_cpu(missed_timer_offsets, i);
23317 +               atomic_set(&my_hist->hist_mode, 1);
23318 +               my_hist->min_lat = LONG_MAX;
23319 +
23320 +               sprintf(name, cpufmt_maxlatproc, i);
23321 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
23322 +               entry = debugfs_create_file(name, 0444, dentry, mp,
23323 +                   &maxlatproc_fops);
23324 +               clear_maxlatprocdata(mp);
23325 +       }
23326 +       entry = debugfs_create_file("pid", 0644, dentry,
23327 +           (void *)&missed_timer_offsets_pid, &pid_fops);
23328 +       entry = debugfs_create_file("reset", 0644, dentry,
23329 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
23330 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
23331 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
23332 +           &enable_fops);
23333 +#endif
23334 +
23335 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
23336 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
23337 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
23338 +           latency_hist_root);
23339 +       for_each_possible_cpu(i) {
23340 +               sprintf(name, cpufmt, i);
23341 +               entry = debugfs_create_file(name, 0444, dentry,
23342 +                   &per_cpu(timerandwakeup_latency_hist, i),
23343 +                   &latency_hist_fops);
23344 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
23345 +               atomic_set(&my_hist->hist_mode, 1);
23346 +               my_hist->min_lat = LONG_MAX;
23347 +
23348 +               sprintf(name, cpufmt_maxlatproc, i);
23349 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
23350 +               entry = debugfs_create_file(name, 0444, dentry, mp,
23351 +                   &maxlatproc_fops);
23352 +               clear_maxlatprocdata(mp);
23353 +       }
23354 +       entry = debugfs_create_file("reset", 0644, dentry,
23355 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
23356 +       entry = debugfs_create_file("timerandwakeup", 0644,
23357 +           enable_root, (void *)&timerandwakeup_enabled_data,
23358 +           &enable_fops);
23359 +#endif
23360 +       return 0;
23361 +}
23362 +
23363 +device_initcall(latency_hist_init);
23364 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
23365 index c1e50cc0d7b0..035e61604455 100644
23366 --- a/kernel/trace/trace.c
23367 +++ b/kernel/trace/trace.c
23368 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
23369         struct task_struct *tsk = current;
23370  
23371         entry->preempt_count            = pc & 0xff;
23372 +       entry->preempt_lazy_count       = preempt_lazy_count();
23373         entry->pid                      = (tsk) ? tsk->pid : 0;
23374         entry->flags =
23375  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
23376 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
23377                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
23378                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
23379                 ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
23380 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
23381 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
23382 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
23383                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
23384 +
23385 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
23386  }
23387  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
23388  
23389 @@ -2898,14 +2902,17 @@ get_total_entries(struct trace_buffer *buf,
23390  
23391  static void print_lat_help_header(struct seq_file *m)
23392  {
23393 -       seq_puts(m, "#                  _------=> CPU#            \n"
23394 -                   "#                 / _-----=> irqs-off        \n"
23395 -                   "#                | / _----=> need-resched    \n"
23396 -                   "#                || / _---=> hardirq/softirq \n"
23397 -                   "#                ||| / _--=> preempt-depth   \n"
23398 -                   "#                |||| /     delay            \n"
23399 -                   "#  cmd     pid   ||||| time  |   caller      \n"
23400 -                   "#     \\   /      |||||  \\    |   /         \n");
23401 +       seq_puts(m, "#                  _--------=> CPU#              \n"
23402 +                   "#                 / _-------=> irqs-off          \n"
23403 +                   "#                | / _------=> need-resched      \n"
23404 +                   "#                || / _-----=> need-resched_lazy \n"
23405 +                   "#                ||| / _----=> hardirq/softirq   \n"
23406 +                   "#                |||| / _---=> preempt-depth     \n"
23407 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
23408 +                   "#                |||||| / _-=> migrate-disable   \n"
23409 +                   "#                ||||||| /     delay             \n"
23410 +                   "# cmd     pid    |||||||| time   |  caller       \n"
23411 +                   "#     \\   /      ||||||||   \\    |  /            \n");
23412  }
23413  
23414  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
23415 @@ -2931,11 +2938,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
23416         print_event_info(buf, m);
23417         seq_puts(m, "#                              _-----=> irqs-off\n"
23418                     "#                             / _----=> need-resched\n"
23419 -                   "#                            | / _---=> hardirq/softirq\n"
23420 -                   "#                            || / _--=> preempt-depth\n"
23421 -                   "#                            ||| /     delay\n"
23422 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
23423 -                   "#              | |       |   ||||       |         |\n");
23424 +                   "#                            |/  _-----=> need-resched_lazy\n"
23425 +                   "#                            || / _---=> hardirq/softirq\n"
23426 +                   "#                            ||| / _--=> preempt-depth\n"
23427 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
23428 +                   "#                            ||||| / _-=> migrate-disable   \n"
23429 +                   "#                            |||||| /    delay\n"
23430 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
23431 +                   "#              | |       |   |||||||      |         |\n");
23432  }
23433  
23434  void
23435 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
23436 index b0d8576c27ae..702b9376b278 100644
23437 --- a/kernel/trace/trace.h
23438 +++ b/kernel/trace/trace.h
23439 @@ -124,6 +124,7 @@ struct kretprobe_trace_entry_head {
23440   *  NEED_RESCHED       - reschedule is requested
23441   *  HARDIRQ            - inside an interrupt handler
23442   *  SOFTIRQ            - inside a softirq handler
23443 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
23444   */
23445  enum trace_flag_type {
23446         TRACE_FLAG_IRQS_OFF             = 0x01,
23447 @@ -133,6 +134,7 @@ enum trace_flag_type {
23448         TRACE_FLAG_SOFTIRQ              = 0x10,
23449         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
23450         TRACE_FLAG_NMI                  = 0x40,
23451 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
23452  };
23453  
23454  #define TRACE_BUF_SIZE         1024
23455 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
23456 index 03c0a48c3ac4..0b85d516b491 100644
23457 --- a/kernel/trace/trace_events.c
23458 +++ b/kernel/trace/trace_events.c
23459 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
23460         __common_field(unsigned char, flags);
23461         __common_field(unsigned char, preempt_count);
23462         __common_field(int, pid);
23463 +       __common_field(unsigned short, migrate_disable);
23464 +       __common_field(unsigned short, padding);
23465  
23466         return ret;
23467  }
23468 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
23469 index 03cdff84d026..940bd10b4406 100644
23470 --- a/kernel/trace/trace_irqsoff.c
23471 +++ b/kernel/trace/trace_irqsoff.c
23472 @@ -13,6 +13,7 @@
23473  #include <linux/uaccess.h>
23474  #include <linux/module.h>
23475  #include <linux/ftrace.h>
23476 +#include <trace/events/hist.h>
23477  
23478  #include "trace.h"
23479  
23480 @@ -424,11 +425,13 @@ void start_critical_timings(void)
23481  {
23482         if (preempt_trace() || irq_trace())
23483                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23484 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
23485  }
23486  EXPORT_SYMBOL_GPL(start_critical_timings);
23487  
23488  void stop_critical_timings(void)
23489  {
23490 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
23491         if (preempt_trace() || irq_trace())
23492                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23493  }
23494 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
23495  #ifdef CONFIG_PROVE_LOCKING
23496  void time_hardirqs_on(unsigned long a0, unsigned long a1)
23497  {
23498 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
23499         if (!preempt_trace() && irq_trace())
23500                 stop_critical_timing(a0, a1);
23501  }
23502 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
23503  {
23504         if (!preempt_trace() && irq_trace())
23505                 start_critical_timing(a0, a1);
23506 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
23507  }
23508  
23509  #else /* !CONFIG_PROVE_LOCKING */
23510 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
23511   */
23512  void trace_hardirqs_on(void)
23513  {
23514 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
23515         if (!preempt_trace() && irq_trace())
23516                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23517  }
23518 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
23519  {
23520         if (!preempt_trace() && irq_trace())
23521                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
23522 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
23523  }
23524  EXPORT_SYMBOL(trace_hardirqs_off);
23525  
23526  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
23527  {
23528 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
23529         if (!preempt_trace() && irq_trace())
23530                 stop_critical_timing(CALLER_ADDR0, caller_addr);
23531  }
23532 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
23533  {
23534         if (!preempt_trace() && irq_trace())
23535                 start_critical_timing(CALLER_ADDR0, caller_addr);
23536 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
23537  }
23538  EXPORT_SYMBOL(trace_hardirqs_off_caller);
23539  
23540 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
23541  #ifdef CONFIG_PREEMPT_TRACER
23542  void trace_preempt_on(unsigned long a0, unsigned long a1)
23543  {
23544 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
23545         if (preempt_trace() && !irq_trace())
23546                 stop_critical_timing(a0, a1);
23547  }
23548  
23549  void trace_preempt_off(unsigned long a0, unsigned long a1)
23550  {
23551 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
23552         if (preempt_trace() && !irq_trace())
23553                 start_critical_timing(a0, a1);
23554  }
23555 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
23556 index 3fc20422c166..65a6dde71a7d 100644
23557 --- a/kernel/trace/trace_output.c
23558 +++ b/kernel/trace/trace_output.c
23559 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
23560  {
23561         char hardsoft_irq;
23562         char need_resched;
23563 +       char need_resched_lazy;
23564         char irqs_off;
23565         int hardirq;
23566         int softirq;
23567 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
23568                 break;
23569         }
23570  
23571 +       need_resched_lazy =
23572 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
23573 +
23574         hardsoft_irq =
23575                 (nmi && hardirq)     ? 'Z' :
23576                 nmi                  ? 'z' :
23577 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
23578                 softirq              ? 's' :
23579                                        '.' ;
23580  
23581 -       trace_seq_printf(s, "%c%c%c",
23582 -                        irqs_off, need_resched, hardsoft_irq);
23583 +       trace_seq_printf(s, "%c%c%c%c",
23584 +                        irqs_off, need_resched, need_resched_lazy,
23585 +                        hardsoft_irq);
23586  
23587         if (entry->preempt_count)
23588                 trace_seq_printf(s, "%x", entry->preempt_count);
23589         else
23590                 trace_seq_putc(s, '.');
23591  
23592 +       if (entry->preempt_lazy_count)
23593 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
23594 +       else
23595 +               trace_seq_putc(s, '.');
23596 +
23597 +       if (entry->migrate_disable)
23598 +               trace_seq_printf(s, "%x", entry->migrate_disable);
23599 +       else
23600 +               trace_seq_putc(s, '.');
23601 +
23602         return !trace_seq_has_overflowed(s);
23603  }
23604  
23605 diff --git a/kernel/user.c b/kernel/user.c
23606 index b069ccbfb0b0..1a2e88e98b5e 100644
23607 --- a/kernel/user.c
23608 +++ b/kernel/user.c
23609 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
23610         if (!up)
23611                 return;
23612  
23613 -       local_irq_save(flags);
23614 +       local_irq_save_nort(flags);
23615         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
23616                 free_user(up, flags);
23617         else
23618 -               local_irq_restore(flags);
23619 +               local_irq_restore_nort(flags);
23620  }
23621  
23622  struct user_struct *alloc_uid(kuid_t uid)
23623 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
23624 index 63177be0159e..59fe007ad496 100644
23625 --- a/kernel/watchdog.c
23626 +++ b/kernel/watchdog.c
23627 @@ -381,6 +381,7 @@ static void watchdog_enable(unsigned int cpu)
23628         /* kick off the timer for the hardlockup detector */
23629         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23630         hrtimer->function = watchdog_timer_fn;
23631 +       hrtimer->irqsafe = 1;
23632  
23633         /* Enable the perf event */
23634         watchdog_nmi_enable(cpu);
23635 diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
23636 index 12b8dd640786..4c90d2ee7433 100644
23637 --- a/kernel/watchdog_hld.c
23638 +++ b/kernel/watchdog_hld.c
23639 @@ -19,6 +19,7 @@
23640  static DEFINE_PER_CPU(bool, hard_watchdog_warn);
23641  static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
23642  static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
23643 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
23644  
23645  /* boot commands */
23646  /*
23647 @@ -104,6 +105,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
23648                 /* only print hardlockups once */
23649                 if (__this_cpu_read(hard_watchdog_warn) == true)
23650                         return;
23651 +               /*
23652 +                * If early-printk is enabled then make sure we do not
23653 +                * lock up in printk() and kill console logging:
23654 +                */
23655 +               printk_kill();
23656 +
23657 +               raw_spin_lock(&watchdog_output_lock);
23658  
23659                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
23660                 print_modules();
23661 @@ -121,6 +129,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
23662                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
23663                         trigger_allbutself_cpu_backtrace();
23664  
23665 +               raw_spin_unlock(&watchdog_output_lock);
23666                 if (hardlockup_panic)
23667                         nmi_panic(regs, "Hard LOCKUP");
23668  
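
Two RT-related problems are handled in this callback, which runs from NMI context: printk_kill() (an -rt helper added elsewhere in this patch) keeps printk from wedging console logging when early-printk is active, and the new raw_spinlock_t serializes the lockup report so dumps from several CPUs do not interleave. A raw spinlock is needed because, on PREEMPT_RT, a plain spinlock_t is a sleeping lock and cannot be taken from NMI or hard-IRQ context. A stripped-down sketch of the same pattern, with a hypothetical function name:

        #include <linux/printk.h>
        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(output_lock);

        static void report_from_nmi(int cpu)
        {
                /* raw_spin_lock() always busy-waits, so it is NMI-safe even on RT. */
                raw_spin_lock(&output_lock);
                pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
                raw_spin_unlock(&output_lock);
        }
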
23669 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
23670 index 296dcca77f33..beee8ad4b9ea 100644
23671 --- a/kernel/workqueue.c
23672 +++ b/kernel/workqueue.c
23673 @@ -48,6 +48,8 @@
23674  #include <linux/nodemask.h>
23675  #include <linux/moduleparam.h>
23676  #include <linux/uaccess.h>
23677 +#include <linux/locallock.h>
23678 +#include <linux/delay.h>
23679  
23680  #include "workqueue_internal.h"
23681  
23682 @@ -122,11 +124,16 @@ enum {
23683   *    cpu or grabbing pool->lock is enough for read access.  If
23684   *    POOL_DISASSOCIATED is set, it's identical to L.
23685   *
23686 + *    On RT we need the extra protection via rt_lock_idle_list() for
23687 + *    the list manipulations against read access from
23688 + *    wq_worker_sleeping(). All other places are nicely serialized via
23689 + *    pool->lock.
23690 + *
23691   * A: pool->attach_mutex protected.
23692   *
23693   * PL: wq_pool_mutex protected.
23694   *
23695 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
23696 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
23697   *
23698   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
23699   *
23700 @@ -135,7 +142,7 @@ enum {
23701   *
23702   * WQ: wq->mutex protected.
23703   *
23704 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
23705 + * WR: wq->mutex protected for writes.  RCU protected for reads.
23706   *
23707   * MD: wq_mayday_lock protected.
23708   */
23709 @@ -185,7 +192,7 @@ struct worker_pool {
23710         atomic_t                nr_running ____cacheline_aligned_in_smp;
23711  
23712         /*
23713 -        * Destruction of pool is sched-RCU protected to allow dereferences
23714 +        * Destruction of pool is RCU protected to allow dereferences
23715          * from get_work_pool().
23716          */
23717         struct rcu_head         rcu;
23718 @@ -214,7 +221,7 @@ struct pool_workqueue {
23719         /*
23720          * Release of unbound pwq is punted to system_wq.  See put_pwq()
23721          * and pwq_unbound_release_workfn() for details.  pool_workqueue
23722 -        * itself is also sched-RCU protected so that the first pwq can be
23723 +        * itself is also RCU protected so that the first pwq can be
23724          * determined without grabbing wq->mutex.
23725          */
23726         struct work_struct      unbound_release_work;
23727 @@ -349,6 +356,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
23728  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
23729  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
23730  
23731 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
23732 +
23733  static int worker_thread(void *__worker);
23734  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23735  
23736 @@ -356,20 +365,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23737  #include <trace/events/workqueue.h>
23738  
23739  #define assert_rcu_or_pool_mutex()                                     \
23740 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23741 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23742                          !lockdep_is_held(&wq_pool_mutex),              \
23743 -                        "sched RCU or wq_pool_mutex should be held")
23744 +                        "RCU or wq_pool_mutex should be held")
23745  
23746  #define assert_rcu_or_wq_mutex(wq)                                     \
23747 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23748 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23749                          !lockdep_is_held(&wq->mutex),                  \
23750 -                        "sched RCU or wq->mutex should be held")
23751 +                        "RCU or wq->mutex should be held")
23752  
23753  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
23754 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23755 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23756                          !lockdep_is_held(&wq->mutex) &&                \
23757                          !lockdep_is_held(&wq_pool_mutex),              \
23758 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
23759 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
23760  
23761  #define for_each_cpu_worker_pool(pool, cpu)                            \
23762         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
23763 @@ -381,7 +390,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23764   * @pool: iteration cursor
23765   * @pi: integer used for iteration
23766   *
23767 - * This must be called either with wq_pool_mutex held or sched RCU read
23768 + * This must be called either with wq_pool_mutex held or RCU read
23769   * locked.  If the pool needs to be used beyond the locking in effect, the
23770   * caller is responsible for guaranteeing that the pool stays online.
23771   *
23772 @@ -413,7 +422,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23773   * @pwq: iteration cursor
23774   * @wq: the target workqueue
23775   *
23776 - * This must be called either with wq->mutex held or sched RCU read locked.
23777 + * This must be called either with wq->mutex held or RCU read locked.
23778   * If the pwq needs to be used beyond the locking in effect, the caller is
23779   * responsible for guaranteeing that the pwq stays online.
23780   *
23781 @@ -425,6 +434,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23782                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
23783                 else
23784  
23785 +#ifdef CONFIG_PREEMPT_RT_BASE
23786 +static inline void rt_lock_idle_list(struct worker_pool *pool)
23787 +{
23788 +       preempt_disable();
23789 +}
23790 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
23791 +{
23792 +       preempt_enable();
23793 +}
23794 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
23795 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
23796 +#else
23797 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
23798 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
23799 +static inline void sched_lock_idle_list(struct worker_pool *pool)
23800 +{
23801 +       spin_lock_irq(&pool->lock);
23802 +}
23803 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
23804 +{
23805 +       spin_unlock_irq(&pool->lock);
23806 +}
23807 +#endif
23808 +
23809 +
23810  #ifdef CONFIG_DEBUG_OBJECTS_WORK
23811  
23812  static struct debug_obj_descr work_debug_descr;
23813 @@ -549,7 +583,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
23814   * @wq: the target workqueue
23815   * @node: the node ID
23816   *
23817 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
23818 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
23819   * read locked.
23820   * If the pwq needs to be used beyond the locking in effect, the caller is
23821   * responsible for guaranteeing that the pwq stays online.
23822 @@ -693,8 +727,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
23823   * @work: the work item of interest
23824   *
23825   * Pools are created and destroyed under wq_pool_mutex, and allows read
23826 - * access under sched-RCU read lock.  As such, this function should be
23827 - * called under wq_pool_mutex or with preemption disabled.
23828 + * access under RCU read lock.  As such, this function should be
23829 + * called under wq_pool_mutex or inside an rcu_read_lock() region.
23830   *
23831   * All fields of the returned pool are accessible as long as the above
23832   * mentioned locking is in effect.  If the returned pool needs to be used
23833 @@ -831,50 +865,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
23834   */
23835  static void wake_up_worker(struct worker_pool *pool)
23836  {
23837 -       struct worker *worker = first_idle_worker(pool);
23838 +       struct worker *worker;
23839 +
23840 +       rt_lock_idle_list(pool);
23841 +
23842 +       worker = first_idle_worker(pool);
23843  
23844         if (likely(worker))
23845                 wake_up_process(worker->task);
23846 +
23847 +       rt_unlock_idle_list(pool);
23848  }
23849  
23850  /**
23851 - * wq_worker_waking_up - a worker is waking up
23852 + * wq_worker_running - a worker is running again
23853   * @task: task waking up
23854 - * @cpu: CPU @task is waking up to
23855   *
23856 - * This function is called during try_to_wake_up() when a worker is
23857 - * being awoken.
23858 - *
23859 - * CONTEXT:
23860 - * spin_lock_irq(rq->lock)
23861 + * This function is called when a worker returns from schedule()
23862   */
23863 -void wq_worker_waking_up(struct task_struct *task, int cpu)
23864 +void wq_worker_running(struct task_struct *task)
23865  {
23866         struct worker *worker = kthread_data(task);
23867  
23868 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
23869 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
23870 +       if (!worker->sleeping)
23871 +               return;
23872 +       if (!(worker->flags & WORKER_NOT_RUNNING))
23873                 atomic_inc(&worker->pool->nr_running);
23874 -       }
23875 +       worker->sleeping = 0;
23876  }
23877  
23878  /**
23879   * wq_worker_sleeping - a worker is going to sleep
23880   * @task: task going to sleep
23881   *
23882 - * This function is called during schedule() when a busy worker is
23883 - * going to sleep.  Worker on the same cpu can be woken up by
23884 - * returning pointer to its task.
23885 - *
23886 - * CONTEXT:
23887 - * spin_lock_irq(rq->lock)
23888 - *
23889 - * Return:
23890 - * Worker task on @cpu to wake up, %NULL if none.
23891 + * This function is called from schedule() when a busy worker is
23892 + * going to sleep.
23893   */
23894 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
23895 +void wq_worker_sleeping(struct task_struct *task)
23896  {
23897 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
23898 +       struct worker *worker = kthread_data(task);
23899         struct worker_pool *pool;
23900  
23901         /*
23902 @@ -883,29 +912,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
23903          * checking NOT_RUNNING.
23904          */
23905         if (worker->flags & WORKER_NOT_RUNNING)
23906 -               return NULL;
23907 +               return;
23908  
23909         pool = worker->pool;
23910  
23911 -       /* this can only happen on the local cpu */
23912 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
23913 -               return NULL;
23914 +       if (WARN_ON_ONCE(worker->sleeping))
23915 +               return;
23916 +
23917 +       worker->sleeping = 1;
23918  
23919         /*
23920          * The counterpart of the following dec_and_test, implied mb,
23921          * worklist not empty test sequence is in insert_work().
23922          * Please read comment there.
23923 -        *
23924 -        * NOT_RUNNING is clear.  This means that we're bound to and
23925 -        * running on the local cpu w/ rq lock held and preemption
23926 -        * disabled, which in turn means that none else could be
23927 -        * manipulating idle_list, so dereferencing idle_list without pool
23928 -        * lock is safe.
23929          */
23930         if (atomic_dec_and_test(&pool->nr_running) &&
23931 -           !list_empty(&pool->worklist))
23932 -               to_wakeup = first_idle_worker(pool);
23933 -       return to_wakeup ? to_wakeup->task : NULL;
23934 +           !list_empty(&pool->worklist)) {
23935 +               sched_lock_idle_list(pool);
23936 +               wake_up_worker(pool);
23937 +               sched_unlock_idle_list(pool);
23938 +       }
23939  }
23940  
23941  /**
23942 @@ -1099,12 +1125,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
23943  {
23944         if (pwq) {
23945                 /*
23946 -                * As both pwqs and pools are sched-RCU protected, the
23947 +                * As both pwqs and pools are RCU protected, the
23948                  * following lock operations are safe.
23949                  */
23950 -               spin_lock_irq(&pwq->pool->lock);
23951 +               rcu_read_lock();
23952 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
23953                 put_pwq(pwq);
23954 -               spin_unlock_irq(&pwq->pool->lock);
23955 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
23956 +               rcu_read_unlock();
23957         }
23958  }
23959  
23960 @@ -1208,7 +1236,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23961         struct worker_pool *pool;
23962         struct pool_workqueue *pwq;
23963  
23964 -       local_irq_save(*flags);
23965 +       local_lock_irqsave(pendingb_lock, *flags);
23966  
23967         /* try to steal the timer if it exists */
23968         if (is_dwork) {
23969 @@ -1227,6 +1255,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23970         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
23971                 return 0;
23972  
23973 +       rcu_read_lock();
23974         /*
23975          * The queueing is in progress, or it is already queued. Try to
23976          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
23977 @@ -1265,14 +1294,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23978                 set_work_pool_and_keep_pending(work, pool->id);
23979  
23980                 spin_unlock(&pool->lock);
23981 +               rcu_read_unlock();
23982                 return 1;
23983         }
23984         spin_unlock(&pool->lock);
23985  fail:
23986 -       local_irq_restore(*flags);
23987 +       rcu_read_unlock();
23988 +       local_unlock_irqrestore(pendingb_lock, *flags);
23989         if (work_is_canceling(work))
23990                 return -ENOENT;
23991 -       cpu_relax();
23992 +       cpu_chill();
23993         return -EAGAIN;
23994  }
23995  
23996 @@ -1374,7 +1405,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
23997          * queued or lose PENDING.  Grabbing PENDING and queueing should
23998          * happen with IRQ disabled.
23999          */
24000 -       WARN_ON_ONCE(!irqs_disabled());
24001 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
24002  
24003         debug_work_activate(work);
24004  
24005 @@ -1382,6 +1413,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
24006         if (unlikely(wq->flags & __WQ_DRAINING) &&
24007             WARN_ON_ONCE(!is_chained_work(wq)))
24008                 return;
24009 +       rcu_read_lock();
24010  retry:
24011         if (req_cpu == WORK_CPU_UNBOUND)
24012                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
24013 @@ -1438,10 +1470,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
24014         /* pwq determined, queue */
24015         trace_workqueue_queue_work(req_cpu, pwq, work);
24016  
24017 -       if (WARN_ON(!list_empty(&work->entry))) {
24018 -               spin_unlock(&pwq->pool->lock);
24019 -               return;
24020 -       }
24021 +       if (WARN_ON(!list_empty(&work->entry)))
24022 +               goto out;
24023  
24024         pwq->nr_in_flight[pwq->work_color]++;
24025         work_flags = work_color_to_flags(pwq->work_color);
24026 @@ -1459,7 +1489,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
24027  
24028         insert_work(pwq, work, worklist, work_flags);
24029  
24030 +out:
24031         spin_unlock(&pwq->pool->lock);
24032 +       rcu_read_unlock();
24033  }
24034  
24035  /**
24036 @@ -1479,14 +1511,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
24037         bool ret = false;
24038         unsigned long flags;
24039  
24040 -       local_irq_save(flags);
24041 +       local_lock_irqsave(pendingb_lock, flags);
24042  
24043         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
24044                 __queue_work(cpu, wq, work);
24045                 ret = true;
24046         }
24047  
24048 -       local_irq_restore(flags);
24049 +       local_unlock_irqrestore(pendingb_lock, flags);
24050         return ret;
24051  }
24052  EXPORT_SYMBOL(queue_work_on);
24053 @@ -1553,14 +1585,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
24054         unsigned long flags;
24055  
24056         /* read the comment in __queue_work() */
24057 -       local_irq_save(flags);
24058 +       local_lock_irqsave(pendingb_lock, flags);
24059  
24060         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
24061                 __queue_delayed_work(cpu, wq, dwork, delay);
24062                 ret = true;
24063         }
24064  
24065 -       local_irq_restore(flags);
24066 +       local_unlock_irqrestore(pendingb_lock, flags);
24067         return ret;
24068  }
24069  EXPORT_SYMBOL(queue_delayed_work_on);
24070 @@ -1595,7 +1627,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
24071  
24072         if (likely(ret >= 0)) {
24073                 __queue_delayed_work(cpu, wq, dwork, delay);
24074 -               local_irq_restore(flags);
24075 +               local_unlock_irqrestore(pendingb_lock, flags);
24076         }
24077  
24078         /* -ENOENT from try_to_grab_pending() becomes %true */
24079 @@ -1628,7 +1660,9 @@ static void worker_enter_idle(struct worker *worker)
24080         worker->last_active = jiffies;
24081  
24082         /* idle_list is LIFO */
24083 +       rt_lock_idle_list(pool);
24084         list_add(&worker->entry, &pool->idle_list);
24085 +       rt_unlock_idle_list(pool);
24086  
24087         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
24088                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
24089 @@ -1661,7 +1695,9 @@ static void worker_leave_idle(struct worker *worker)
24090                 return;
24091         worker_clr_flags(worker, WORKER_IDLE);
24092         pool->nr_idle--;
24093 +       rt_lock_idle_list(pool);
24094         list_del_init(&worker->entry);
24095 +       rt_unlock_idle_list(pool);
24096  }
24097  
24098  static struct worker *alloc_worker(int node)
24099 @@ -1827,7 +1863,9 @@ static void destroy_worker(struct worker *worker)
24100         pool->nr_workers--;
24101         pool->nr_idle--;
24102  
24103 +       rt_lock_idle_list(pool);
24104         list_del_init(&worker->entry);
24105 +       rt_unlock_idle_list(pool);
24106         worker->flags |= WORKER_DIE;
24107         wake_up_process(worker->task);
24108  }
24109 @@ -2779,14 +2817,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
24110  
24111         might_sleep();
24112  
24113 -       local_irq_disable();
24114 +       rcu_read_lock();
24115         pool = get_work_pool(work);
24116         if (!pool) {
24117 -               local_irq_enable();
24118 +               rcu_read_unlock();
24119                 return false;
24120         }
24121  
24122 -       spin_lock(&pool->lock);
24123 +       spin_lock_irq(&pool->lock);
24124         /* see the comment in try_to_grab_pending() with the same code */
24125         pwq = get_work_pwq(work);
24126         if (pwq) {
24127 @@ -2815,10 +2853,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
24128         else
24129                 lock_map_acquire_read(&pwq->wq->lockdep_map);
24130         lock_map_release(&pwq->wq->lockdep_map);
24131 -
24132 +       rcu_read_unlock();
24133         return true;
24134  already_gone:
24135         spin_unlock_irq(&pool->lock);
24136 +       rcu_read_unlock();
24137         return false;
24138  }
24139  
24140 @@ -2905,7 +2944,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
24141  
24142         /* tell other tasks trying to grab @work to back off */
24143         mark_work_canceling(work);
24144 -       local_irq_restore(flags);
24145 +       local_unlock_irqrestore(pendingb_lock, flags);
24146  
24147         flush_work(work);
24148         clear_work_data(work);
24149 @@ -2960,10 +2999,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
24150   */
24151  bool flush_delayed_work(struct delayed_work *dwork)
24152  {
24153 -       local_irq_disable();
24154 +       local_lock_irq(pendingb_lock);
24155         if (del_timer_sync(&dwork->timer))
24156                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
24157 -       local_irq_enable();
24158 +       local_unlock_irq(pendingb_lock);
24159         return flush_work(&dwork->work);
24160  }
24161  EXPORT_SYMBOL(flush_delayed_work);
24162 @@ -2981,7 +3020,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
24163                 return false;
24164  
24165         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
24166 -       local_irq_restore(flags);
24167 +       local_unlock_irqrestore(pendingb_lock, flags);
24168         return ret;
24169  }
24170  
24171 @@ -3238,7 +3277,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
24172   * put_unbound_pool - put a worker_pool
24173   * @pool: worker_pool to put
24174   *
24175 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
24176 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
24177   * safe manner.  get_unbound_pool() calls this function on its failure path
24178   * and this function should be able to release pools which went through,
24179   * successfully or not, init_worker_pool().
24180 @@ -3292,8 +3331,8 @@ static void put_unbound_pool(struct worker_pool *pool)
24181         del_timer_sync(&pool->idle_timer);
24182         del_timer_sync(&pool->mayday_timer);
24183  
24184 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
24185 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
24186 +       /* RCU protected to allow dereferences from get_work_pool() */
24187 +       call_rcu(&pool->rcu, rcu_free_pool);
24188  }
24189  
24190  /**
24191 @@ -3400,14 +3439,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
24192         put_unbound_pool(pool);
24193         mutex_unlock(&wq_pool_mutex);
24194  
24195 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
24196 +       call_rcu(&pwq->rcu, rcu_free_pwq);
24197  
24198         /*
24199          * If we're the last pwq going away, @wq is already dead and no one
24200          * is gonna access it anymore.  Schedule RCU free.
24201          */
24202         if (is_last)
24203 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
24204 +               call_rcu(&wq->rcu, rcu_free_wq);
24205  }
24206  
24207  /**
24208 @@ -4071,7 +4110,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
24209                  * The base ref is never dropped on per-cpu pwqs.  Directly
24210                  * schedule RCU free.
24211                  */
24212 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
24213 +               call_rcu(&wq->rcu, rcu_free_wq);
24214         } else {
24215                 /*
24216                  * We're the sole accessor of @wq at this point.  Directly
24217 @@ -4165,7 +4204,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
24218         struct pool_workqueue *pwq;
24219         bool ret;
24220  
24221 -       rcu_read_lock_sched();
24222 +       rcu_read_lock();
24223 +       preempt_disable();
24224  
24225         if (cpu == WORK_CPU_UNBOUND)
24226                 cpu = smp_processor_id();
24227 @@ -4176,7 +4216,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
24228                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
24229  
24230         ret = !list_empty(&pwq->delayed_works);
24231 -       rcu_read_unlock_sched();
24232 +       preempt_enable();
24233 +       rcu_read_unlock();
24234  
24235         return ret;
24236  }
24237 @@ -4202,15 +4243,15 @@ unsigned int work_busy(struct work_struct *work)
24238         if (work_pending(work))
24239                 ret |= WORK_BUSY_PENDING;
24240  
24241 -       local_irq_save(flags);
24242 +       rcu_read_lock();
24243         pool = get_work_pool(work);
24244         if (pool) {
24245 -               spin_lock(&pool->lock);
24246 +               spin_lock_irqsave(&pool->lock, flags);
24247                 if (find_worker_executing_work(pool, work))
24248                         ret |= WORK_BUSY_RUNNING;
24249 -               spin_unlock(&pool->lock);
24250 +               spin_unlock_irqrestore(&pool->lock, flags);
24251         }
24252 -       local_irq_restore(flags);
24253 +       rcu_read_unlock();
24254  
24255         return ret;
24256  }
24257 @@ -4399,7 +4440,7 @@ void show_workqueue_state(void)
24258         unsigned long flags;
24259         int pi;
24260  
24261 -       rcu_read_lock_sched();
24262 +       rcu_read_lock();
24263  
24264         pr_info("Showing busy workqueues and worker pools:\n");
24265  
24266 @@ -4452,7 +4493,7 @@ void show_workqueue_state(void)
24267                 spin_unlock_irqrestore(&pool->lock, flags);
24268         }
24269  
24270 -       rcu_read_unlock_sched();
24271 +       rcu_read_unlock();
24272  }
24273  
24274  /*
24275 @@ -4790,16 +4831,16 @@ bool freeze_workqueues_busy(void)
24276                  * nr_active is monotonically decreasing.  It's safe
24277                  * to peek without lock.
24278                  */
24279 -               rcu_read_lock_sched();
24280 +               rcu_read_lock();
24281                 for_each_pwq(pwq, wq) {
24282                         WARN_ON_ONCE(pwq->nr_active < 0);
24283                         if (pwq->nr_active) {
24284                                 busy = true;
24285 -                               rcu_read_unlock_sched();
24286 +                               rcu_read_unlock();
24287                                 goto out_unlock;
24288                         }
24289                 }
24290 -               rcu_read_unlock_sched();
24291 +               rcu_read_unlock();
24292         }
24293  out_unlock:
24294         mutex_unlock(&wq_pool_mutex);
24295 @@ -4989,7 +5030,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
24296         const char *delim = "";
24297         int node, written = 0;
24298  
24299 -       rcu_read_lock_sched();
24300 +       get_online_cpus();
24301 +       rcu_read_lock();
24302         for_each_node(node) {
24303                 written += scnprintf(buf + written, PAGE_SIZE - written,
24304                                      "%s%d:%d", delim, node,
24305 @@ -4997,7 +5039,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
24306                 delim = " ";
24307         }
24308         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
24309 -       rcu_read_unlock_sched();
24310 +       rcu_read_unlock();
24311 +       put_online_cpus();
24312  
24313         return written;
24314  }
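
Most of the workqueue changes in this file follow one pattern: code that used bare local_irq_save()/local_irq_restore() to protect the PENDING bit now goes through the local lock pendingb_lock, and the sched-RCU read sides (rcu_read_lock_sched(), call_rcu_sched()) become plain RCU. On non-RT kernels a local lock roughly degrades to the old interrupt disabling; on PREEMPT_RT it is a per-CPU sleeping lock, so the section stays preemptible while still excluding other users of the same per-CPU state. A minimal usage sketch of the locallock API that this patch introduces, with hypothetical lock and function names:

        #include <linux/locallock.h>

        static DEFINE_LOCAL_IRQ_LOCK(example_lock);

        static void touch_pending_state(void)
        {
                unsigned long flags;

                /* non-RT: behaves like local_irq_save(); RT: takes a per-CPU lock. */
                local_lock_irqsave(example_lock, flags);
                /* ... manipulate the IRQ-sensitive per-CPU state ... */
                local_unlock_irqrestore(example_lock, flags);
        }
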
24315 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
24316 index 29fa81f0f51a..42d1e3974554 100644
24317 --- a/kernel/workqueue_internal.h
24318 +++ b/kernel/workqueue_internal.h
24319 @@ -44,6 +44,7 @@ struct worker {
24320         unsigned long           last_active;    /* L: last active timestamp */
24321         unsigned int            flags;          /* X: flags */
24322         int                     id;             /* I: worker id */
24323 +       int                     sleeping;       /* None */
24324  
24325         /*
24326          * Opaque string set with work_set_desc().  Printed out with task
24327 @@ -69,7 +70,7 @@ static inline struct worker *current_wq_worker(void)
24328   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
24329   * sched/core.c and workqueue.c.
24330   */
24331 -void wq_worker_waking_up(struct task_struct *task, int cpu);
24332 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
24333 +void wq_worker_running(struct task_struct *task);
24334 +void wq_worker_sleeping(struct task_struct *task);
24335  
24336  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
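
With the worker wake-up moved out of the scheduler fast path, the two hooks change shape: wq_worker_sleeping() no longer returns a task to wake (it wakes an idle worker itself, under the idle-list protection added above), and wq_worker_waking_up() is replaced by wq_worker_running(), called once the worker is back on a CPU. The scheduler side is not part of this excerpt; the following is only a hypothetical sketch of the pairing contract, based on the comments in the hunks above:

        /* Illustrative only: names and placement of the real call sites may differ. */
        static void example_block_and_resume(struct task_struct *tsk)
        {
                if (tsk->flags & PF_WQ_WORKER)
                        wq_worker_sleeping(tsk);        /* busy worker about to block */

                /* ... __schedule() switches away and later back to tsk ... */

                if (tsk->flags & PF_WQ_WORKER)
                        wq_worker_running(tsk);         /* worker runs again */
        }
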
24337 diff --git a/lib/Kconfig b/lib/Kconfig
24338 index 260a80e313b9..b06becb3f477 100644
24339 --- a/lib/Kconfig
24340 +++ b/lib/Kconfig
24341 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
24342  
24343  config CPUMASK_OFFSTACK
24344         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
24345 +       depends on !PREEMPT_RT_FULL
24346         help
24347           Use dynamic allocation for cpumask_var_t, instead of putting
24348           them on the stack.  This is a bit more expensive, but avoids
24349 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
24350 index 056052dc8e91..d8494e126de8 100644
24351 --- a/lib/debugobjects.c
24352 +++ b/lib/debugobjects.c
24353 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
24354         struct debug_obj *obj;
24355         unsigned long flags;
24356  
24357 -       fill_pool();
24358 +#ifdef CONFIG_PREEMPT_RT_FULL
24359 +       if (preempt_count() == 0 && !irqs_disabled())
24360 +#endif
24361 +               fill_pool();
24362  
24363         db = get_bucket((unsigned long) addr);
24364  
24365 diff --git a/lib/idr.c b/lib/idr.c
24366 index 6098336df267..9decbe914595 100644
24367 --- a/lib/idr.c
24368 +++ b/lib/idr.c
24369 @@ -30,6 +30,7 @@
24370  #include <linux/idr.h>
24371  #include <linux/spinlock.h>
24372  #include <linux/percpu.h>
24373 +#include <linux/locallock.h>
24374  
24375  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
24376  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
24377 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
24378  static DEFINE_PER_CPU(int, idr_preload_cnt);
24379  static DEFINE_SPINLOCK(simple_ida_lock);
24380  
24381 +#ifdef CONFIG_PREEMPT_RT_FULL
24382 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
24383 +
24384 +static inline void idr_preload_lock(void)
24385 +{
24386 +       local_lock(idr_lock);
24387 +}
24388 +
24389 +static inline void idr_preload_unlock(void)
24390 +{
24391 +       local_unlock(idr_lock);
24392 +}
24393 +
24394 +void idr_preload_end(void)
24395 +{
24396 +       idr_preload_unlock();
24397 +}
24398 +EXPORT_SYMBOL(idr_preload_end);
24399 +#else
24400 +static inline void idr_preload_lock(void)
24401 +{
24402 +       preempt_disable();
24403 +}
24404 +
24405 +static inline void idr_preload_unlock(void)
24406 +{
24407 +       preempt_enable();
24408 +}
24409 +#endif
24410 +
24411 +
24412  /* the maximum ID which can be allocated given idr->layers */
24413  static int idr_max(int layers)
24414  {
24415 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
24416          * context.  See idr_preload() for details.
24417          */
24418         if (!in_interrupt()) {
24419 -               preempt_disable();
24420 +               idr_preload_lock();
24421                 new = __this_cpu_read(idr_preload_head);
24422                 if (new) {
24423                         __this_cpu_write(idr_preload_head, new->ary[0]);
24424                         __this_cpu_dec(idr_preload_cnt);
24425                         new->ary[0] = NULL;
24426                 }
24427 -               preempt_enable();
24428 +               idr_preload_unlock();
24429                 if (new)
24430                         return new;
24431         }
24432 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
24433         idr_mark_full(pa, id);
24434  }
24435  
24436 -
24437  /**
24438   * idr_preload - preload for idr_alloc()
24439   * @gfp_mask: allocation mask to use for preloading
24440 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
24441         WARN_ON_ONCE(in_interrupt());
24442         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
24443  
24444 -       preempt_disable();
24445 +       idr_preload_lock();
24446  
24447         /*
24448          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
24449 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
24450         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
24451                 struct idr_layer *new;
24452  
24453 -               preempt_enable();
24454 +               idr_preload_unlock();
24455                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
24456 -               preempt_disable();
24457 +               idr_preload_lock();
24458                 if (!new)
24459                         break;
24460  
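
The preload path used to rely on preempt_disable() between idr_preload() and idr_preload_end(); on PREEMPT_RT that window must stay preemptible, so it is guarded by the local lock idr_lock instead, and idr_preload_end() becomes a real exported function on RT so it can drop that lock. The caller-side contract is unchanged; for reference, the usual pattern from the idr_preload() kerneldoc, with hypothetical lock/idr names:

        #include <linux/idr.h>
        #include <linux/spinlock.h>

        static DEFINE_SPINLOCK(example_lock);
        static DEFINE_IDR(example_idr);

        static int example_alloc_id(void *ptr)
        {
                int id;

                idr_preload(GFP_KERNEL);
                spin_lock(&example_lock);
                /* GFP_NOWAIT: the node must come from the per-CPU preload cache. */
                id = idr_alloc(&example_idr, ptr, 0, 0, GFP_NOWAIT);
                spin_unlock(&example_lock);
                idr_preload_end();

                return id;
        }
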
24461 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
24462 index 1d6565e81030..b23a79761df7 100644
24463 --- a/lib/irq_poll.c
24464 +++ b/lib/irq_poll.c
24465 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
24466         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
24467         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
24468         local_irq_restore(flags);
24469 +       preempt_check_resched_rt();
24470  }
24471  EXPORT_SYMBOL(irq_poll_sched);
24472  
24473 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
24474         local_irq_save(flags);
24475         __irq_poll_complete(iop);
24476         local_irq_restore(flags);
24477 +       preempt_check_resched_rt();
24478  }
24479  EXPORT_SYMBOL(irq_poll_complete);
24480  
24481 @@ -95,6 +97,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
24482                 }
24483  
24484                 local_irq_enable();
24485 +               preempt_check_resched_rt();
24486  
24487                 /* Even though interrupts have been re-enabled, this
24488                  * access is safe because interrupts can only add new
24489 @@ -132,6 +135,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
24490                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
24491  
24492         local_irq_enable();
24493 +       preempt_check_resched_rt();
24494  }
24495  
24496  /**
24497 @@ -195,6 +199,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
24498                          this_cpu_ptr(&blk_cpu_iopoll));
24499         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
24500         local_irq_enable();
24501 +       preempt_check_resched_rt();
24502  
24503         return 0;
24504  }
24505 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
24506 index f3a217ea0388..4611b156ef79 100644
24507 --- a/lib/locking-selftest.c
24508 +++ b/lib/locking-selftest.c
24509 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
24510  #include "locking-selftest-spin-hardirq.h"
24511  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
24512  
24513 +#ifndef CONFIG_PREEMPT_RT_FULL
24514 +
24515  #include "locking-selftest-rlock-hardirq.h"
24516  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
24517  
24518 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
24519  #include "locking-selftest-wlock-softirq.h"
24520  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
24521  
24522 +#endif
24523 +
24524  #undef E1
24525  #undef E2
24526  
24527 +#ifndef CONFIG_PREEMPT_RT_FULL
24528  /*
24529   * Enabling hardirqs with a softirq-safe lock held:
24530   */
24531 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
24532  #undef E1
24533  #undef E2
24534  
24535 +#endif
24536 +
24537  /*
24538   * Enabling irqs with an irq-safe lock held:
24539   */
24540 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
24541  #include "locking-selftest-spin-hardirq.h"
24542  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
24543  
24544 +#ifndef CONFIG_PREEMPT_RT_FULL
24545 +
24546  #include "locking-selftest-rlock-hardirq.h"
24547  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
24548  
24549 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
24550  #include "locking-selftest-wlock-softirq.h"
24551  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
24552  
24553 +#endif
24554 +
24555  #undef E1
24556  #undef E2
24557  
24558 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
24559  #include "locking-selftest-spin-hardirq.h"
24560  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
24561  
24562 +#ifndef CONFIG_PREEMPT_RT_FULL
24563 +
24564  #include "locking-selftest-rlock-hardirq.h"
24565  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
24566  
24567 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
24568  #include "locking-selftest-wlock-softirq.h"
24569  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
24570  
24571 +#endif
24572 +
24573  #undef E1
24574  #undef E2
24575  #undef E3
24576 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
24577  #include "locking-selftest-spin-hardirq.h"
24578  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
24579  
24580 +#ifndef CONFIG_PREEMPT_RT_FULL
24581 +
24582  #include "locking-selftest-rlock-hardirq.h"
24583  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
24584  
24585 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
24586  #include "locking-selftest-wlock-softirq.h"
24587  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
24588  
24589 +#endif
24590 +
24591  #undef E1
24592  #undef E2
24593  #undef E3
24594  
24595 +#ifndef CONFIG_PREEMPT_RT_FULL
24596 +
24597  /*
24598   * read-lock / write-lock irq inversion.
24599   *
24600 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
24601  #undef E2
24602  #undef E3
24603  
24604 +#endif
24605 +
24606 +#ifndef CONFIG_PREEMPT_RT_FULL
24607 +
24608  /*
24609   * read-lock / write-lock recursion that is actually safe.
24610   */
24611 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
24612  #undef E2
24613  #undef E3
24614  
24615 +#endif
24616 +
24617  /*
24618   * read-lock / write-lock recursion that is unsafe.
24619   */
24620 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
24621  
24622         printk("  --------------------------------------------------------------------------\n");
24623  
24624 +#ifndef CONFIG_PREEMPT_RT_FULL
24625         /*
24626          * irq-context testcases:
24627          */
24628 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
24629  
24630         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
24631  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
24632 +#else
24633 +       /* On -rt, we only run the hardirq context tests for raw spinlocks */
24634 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
24635 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
24636 +
24637 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
24638 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
24639 +
24640 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
24641 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
24642 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
24643 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
24644 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
24645 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
24646 +
24647 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
24648 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
24649 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
24650 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
24651 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
24652 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
24653 +#endif
24654  
24655         ww_tests();
24656  
24657 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
24658 index 6d40944960de..822a2c027e72 100644
24659 --- a/lib/percpu_ida.c
24660 +++ b/lib/percpu_ida.c
24661 @@ -26,6 +26,9 @@
24662  #include <linux/string.h>
24663  #include <linux/spinlock.h>
24664  #include <linux/percpu_ida.h>
24665 +#include <linux/locallock.h>
24666 +
24667 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
24668  
24669  struct percpu_ida_cpu {
24670         /*
24671 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24672         unsigned long flags;
24673         int tag;
24674  
24675 -       local_irq_save(flags);
24676 +       local_lock_irqsave(irq_off_lock, flags);
24677         tags = this_cpu_ptr(pool->tag_cpu);
24678  
24679         /* Fastpath */
24680         tag = alloc_local_tag(tags);
24681         if (likely(tag >= 0)) {
24682 -               local_irq_restore(flags);
24683 +               local_unlock_irqrestore(irq_off_lock, flags);
24684                 return tag;
24685         }
24686  
24687 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24688  
24689                 if (!tags->nr_free)
24690                         alloc_global_tags(pool, tags);
24691 +
24692                 if (!tags->nr_free)
24693                         steal_tags(pool, tags);
24694  
24695 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24696                 }
24697  
24698                 spin_unlock(&pool->lock);
24699 -               local_irq_restore(flags);
24700 +               local_unlock_irqrestore(irq_off_lock, flags);
24701  
24702                 if (tag >= 0 || state == TASK_RUNNING)
24703                         break;
24704 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24705  
24706                 schedule();
24707  
24708 -               local_irq_save(flags);
24709 +               local_lock_irqsave(irq_off_lock, flags);
24710                 tags = this_cpu_ptr(pool->tag_cpu);
24711         }
24712         if (state != TASK_RUNNING)
24713 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
24714  
24715         BUG_ON(tag >= pool->nr_tags);
24716  
24717 -       local_irq_save(flags);
24718 +       local_lock_irqsave(irq_off_lock, flags);
24719         tags = this_cpu_ptr(pool->tag_cpu);
24720  
24721         spin_lock(&tags->lock);
24722 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
24723                 spin_unlock(&pool->lock);
24724         }
24725  
24726 -       local_irq_restore(flags);
24727 +       local_unlock_irqrestore(irq_off_lock, flags);
24728  }
24729  EXPORT_SYMBOL_GPL(percpu_ida_free);
24730  
24731 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
24732         struct percpu_ida_cpu *remote;
24733         unsigned cpu, i, err = 0;
24734  
24735 -       local_irq_save(flags);
24736 +       local_lock_irqsave(irq_off_lock, flags);
24737         for_each_possible_cpu(cpu) {
24738                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
24739                 spin_lock(&remote->lock);
24740 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
24741         }
24742         spin_unlock(&pool->lock);
24743  out:
24744 -       local_irq_restore(flags);
24745 +       local_unlock_irqrestore(irq_off_lock, flags);
24746         return err;
24747  }
24748  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
24749 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
24750 index 8e6d552c40dd..741da5a77fd5 100644
24751 --- a/lib/radix-tree.c
24752 +++ b/lib/radix-tree.c
24753 @@ -36,7 +36,7 @@
24754  #include <linux/bitops.h>
24755  #include <linux/rcupdate.h>
24756  #include <linux/preempt.h>             /* in_interrupt() */
24757 -
24758 +#include <linux/locallock.h>
24759  
24760  /* Number of nodes in fully populated tree of given height */
24761  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
24762 @@ -68,6 +68,7 @@ struct radix_tree_preload {
24763         struct radix_tree_node *nodes;
24764  };
24765  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
24766 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
24767  
24768  static inline void *node_to_entry(void *ptr)
24769  {
24770 @@ -290,13 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
24771                  * succeed in getting a node here (and never reach
24772                  * kmem_cache_alloc)
24773                  */
24774 -               rtp = this_cpu_ptr(&radix_tree_preloads);
24775 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
24776                 if (rtp->nr) {
24777                         ret = rtp->nodes;
24778                         rtp->nodes = ret->private_data;
24779                         ret->private_data = NULL;
24780                         rtp->nr--;
24781                 }
24782 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
24783                 /*
24784                  * Update the allocation stack trace as this is more useful
24785                  * for debugging.
24786 @@ -357,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
24787          */
24788         gfp_mask &= ~__GFP_ACCOUNT;
24789  
24790 -       preempt_disable();
24791 +       local_lock(radix_tree_preloads_lock);
24792         rtp = this_cpu_ptr(&radix_tree_preloads);
24793         while (rtp->nr < nr) {
24794 -               preempt_enable();
24795 +               local_unlock(radix_tree_preloads_lock);
24796                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
24797                 if (node == NULL)
24798                         goto out;
24799 -               preempt_disable();
24800 +               local_lock(radix_tree_preloads_lock);
24801                 rtp = this_cpu_ptr(&radix_tree_preloads);
24802                 if (rtp->nr < nr) {
24803                         node->private_data = rtp->nodes;
24804 @@ -406,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
24805         if (gfpflags_allow_blocking(gfp_mask))
24806                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
24807         /* Preloading doesn't help anything with this gfp mask, skip it */
24808 -       preempt_disable();
24809 +       local_lock(radix_tree_preloads_lock);
24810         return 0;
24811  }
24812  EXPORT_SYMBOL(radix_tree_maybe_preload);
24813 @@ -422,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
24814  
24815         /* Preloading doesn't help anything with this gfp mask, skip it */
24816         if (!gfpflags_allow_blocking(gfp_mask)) {
24817 -               preempt_disable();
24818 +               local_lock(radix_tree_preloads_lock);
24819                 return 0;
24820         }
24821  
24822 @@ -456,6 +458,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
24823         return __radix_tree_preload(gfp_mask, nr_nodes);
24824  }
24825  
24826 +void radix_tree_preload_end(void)
24827 +{
24828 +       local_unlock(radix_tree_preloads_lock);
24829 +}
24830 +EXPORT_SYMBOL(radix_tree_preload_end);
24831 +
24832  /*
24833   * The maximum index which can be stored in a radix tree
24834   */
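
Same treatment as lib/idr.c: the per-CPU radix_tree_preloads buffer is now covered by a local lock rather than bare preempt_disable(), and radix_tree_preload_end() — previously an inline preempt_enable() wrapper in the header — becomes an exported function that releases that lock. The established caller pattern keeps working; a reference sketch with hypothetical names:

        #include <linux/radix-tree.h>
        #include <linux/spinlock.h>

        static RADIX_TREE(example_tree, GFP_ATOMIC);
        static DEFINE_SPINLOCK(example_tree_lock);

        static int example_insert(unsigned long index, void *item)
        {
                int err;

                err = radix_tree_preload(GFP_KERNEL);
                if (err)
                        return err;

                spin_lock(&example_tree_lock);
                err = radix_tree_insert(&example_tree, index, item);
                spin_unlock(&example_tree_lock);

                radix_tree_preload_end();       /* releases the local lock on RT */
                return err;
        }
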
24835 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
24836 index 004fc70fc56a..ccc46992a517 100644
24837 --- a/lib/scatterlist.c
24838 +++ b/lib/scatterlist.c
24839 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
24840                         flush_kernel_dcache_page(miter->page);
24841  
24842                 if (miter->__flags & SG_MITER_ATOMIC) {
24843 -                       WARN_ON_ONCE(preemptible());
24844 +                       WARN_ON_ONCE(!pagefault_disabled());
24845                         kunmap_atomic(miter->addr);
24846                 } else
24847                         kunmap(miter->page);
24848 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
24849         if (!sg_miter_skip(&miter, skip))
24850                 return false;
24851  
24852 -       local_irq_save(flags);
24853 +       local_irq_save_nort(flags);
24854  
24855         while (sg_miter_next(&miter) && offset < buflen) {
24856                 unsigned int len;
24857 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
24858  
24859         sg_miter_stop(&miter);
24860  
24861 -       local_irq_restore(flags);
24862 +       local_irq_restore_nort(flags);
24863         return offset;
24864  }
24865  EXPORT_SYMBOL(sg_copy_buffer);
24866 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
24867 index 1afec32de6f2..11fa431046a8 100644
24868 --- a/lib/smp_processor_id.c
24869 +++ b/lib/smp_processor_id.c
24870 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
24871         if (!printk_ratelimit())
24872                 goto out_enable;
24873  
24874 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
24875 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
24876 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
24877 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
24878 +               current->comm, current->pid);
24879  
24880         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
24881         dump_stack();
24882 diff --git a/localversion-rt b/localversion-rt
24883 new file mode 100644
24884 index 000000000000..66fa05e70f29
24885 --- /dev/null
24886 +++ b/localversion-rt
24887 @@ -0,0 +1 @@
24888 +-rt60
24889 diff --git a/mm/Kconfig b/mm/Kconfig
24890 index 86e3e0e74d20..77e5862a1ed2 100644
24891 --- a/mm/Kconfig
24892 +++ b/mm/Kconfig
24893 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
24894  
24895  config TRANSPARENT_HUGEPAGE
24896         bool "Transparent Hugepage Support"
24897 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
24898 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
24899         select COMPACTION
24900         select RADIX_TREE_MULTIORDER
24901         help
24902 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
24903 index 6ff2d7744223..b5a91dd53b5f 100644
24904 --- a/mm/backing-dev.c
24905 +++ b/mm/backing-dev.c
24906 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
24907  {
24908         unsigned long flags;
24909  
24910 -       local_irq_save(flags);
24911 +       local_irq_save_nort(flags);
24912         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
24913 -               local_irq_restore(flags);
24914 +               local_irq_restore_nort(flags);
24915                 return;
24916         }
24917  
24918 diff --git a/mm/compaction.c b/mm/compaction.c
24919 index 70e6bec46dc2..6678ed58b7c6 100644
24920 --- a/mm/compaction.c
24921 +++ b/mm/compaction.c
24922 @@ -1593,10 +1593,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
24923                                 block_start_pfn(cc->migrate_pfn, cc->order);
24924  
24925                         if (cc->last_migrated_pfn < current_block_start) {
24926 -                               cpu = get_cpu();
24927 +                               cpu = get_cpu_light();
24928 +                               local_lock_irq(swapvec_lock);
24929                                 lru_add_drain_cpu(cpu);
24930 +                               local_unlock_irq(swapvec_lock);
24931                                 drain_local_pages(zone);
24932 -                               put_cpu();
24933 +                               put_cpu_light();
24934                                 /* No more flushing until we migrate again */
24935                                 cc->last_migrated_pfn = 0;
24936                         }
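
get_cpu()/put_cpu() disable preemption for the whole drain, which is exactly what RT avoids: get_cpu_light() is an -rt helper that pins the task to its CPU with migrate_disable() instead, and the per-CPU LRU pagevecs touched by lru_add_drain_cpu() are protected by the swapvec_lock local lock defined elsewhere in this patch. A sketch of the assumed helper semantics (the exact -rt definitions may differ):

        #ifdef CONFIG_PREEMPT_RT_FULL
        /* RT: stay preemptible, but forbid migration off this CPU. */
        # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
        # define put_cpu_light()        migrate_enable()
        #else
        # define get_cpu_light()        get_cpu()
        # define put_cpu_light()        put_cpu()
        #endif
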
24937 diff --git a/mm/filemap.c b/mm/filemap.c
24938 index edfb90e3830c..a8d2c7a73d54 100644
24939 --- a/mm/filemap.c
24940 +++ b/mm/filemap.c
24941 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
24942                  * node->private_list is protected by
24943                  * mapping->tree_lock.
24944                  */
24945 -               if (!list_empty(&node->private_list))
24946 -                       list_lru_del(&workingset_shadow_nodes,
24947 +               if (!list_empty(&node->private_list)) {
24948 +                       local_lock(workingset_shadow_lock);
24949 +                       list_lru_del(&__workingset_shadow_nodes,
24950                                      &node->private_list);
24951 +                       local_unlock(workingset_shadow_lock);
24952 +               }
24953         }
24954         return 0;
24955  }
24956 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
24957                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
24958                                 list_empty(&node->private_list)) {
24959                         node->private_data = mapping;
24960 -                       list_lru_add(&workingset_shadow_nodes,
24961 -                                       &node->private_list);
24962 +                       local_lock(workingset_shadow_lock);
24963 +                       list_lru_add(&__workingset_shadow_nodes,
24964 +                                    &node->private_list);
24965 +                       local_unlock(workingset_shadow_lock);
24966                 }
24967         }
24968  
24969 diff --git a/mm/highmem.c b/mm/highmem.c
24970 index 50b4ca6787f0..77518a3b35a1 100644
24971 --- a/mm/highmem.c
24972 +++ b/mm/highmem.c
24973 @@ -29,10 +29,11 @@
24974  #include <linux/kgdb.h>
24975  #include <asm/tlbflush.h>
24976  
24977 -
24978 +#ifndef CONFIG_PREEMPT_RT_FULL
24979  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
24980  DEFINE_PER_CPU(int, __kmap_atomic_idx);
24981  #endif
24982 +#endif
24983  
24984  /*
24985   * Virtual_count is not a pure "count".
24986 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
24987  unsigned long totalhigh_pages __read_mostly;
24988  EXPORT_SYMBOL(totalhigh_pages);
24989  
24990 -
24991 +#ifndef CONFIG_PREEMPT_RT_FULL
24992  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
24993 +#endif
24994  
24995  unsigned int nr_free_highpages (void)
24996  {
24997 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
24998 index 2a800c4a39bd..c04403033aec 100644
24999 --- a/mm/memcontrol.c
25000 +++ b/mm/memcontrol.c
25001 @@ -67,6 +67,7 @@
25002  #include <net/sock.h>
25003  #include <net/ip.h>
25004  #include "slab.h"
25005 +#include <linux/locallock.h>
25006  
25007  #include <asm/uaccess.h>
25008  
25009 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
25010  #define do_swap_account                0
25011  #endif
25012  
25013 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
25014 +
25015  /* Whether legacy memory+swap accounting is active */
25016  static bool do_memsw_account(void)
25017  {
25018 @@ -1795,7 +1798,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
25019                 return;
25020         /* Notify other cpus that system-wide "drain" is running */
25021         get_online_cpus();
25022 -       curcpu = get_cpu();
25023 +       curcpu = get_cpu_light();
25024         for_each_online_cpu(cpu) {
25025                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
25026                 struct mem_cgroup *memcg;
25027 @@ -1812,7 +1815,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
25028                                 schedule_work_on(cpu, &stock->work);
25029                 }
25030         }
25031 -       put_cpu();
25032 +       put_cpu_light();
25033         put_online_cpus();
25034         mutex_unlock(&percpu_charge_mutex);
25035  }
25036 @@ -4558,12 +4561,12 @@ static int mem_cgroup_move_account(struct page *page,
25037  
25038         ret = 0;
25039  
25040 -       local_irq_disable();
25041 +       local_lock_irq(event_lock);
25042         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
25043         memcg_check_events(to, page);
25044         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
25045         memcg_check_events(from, page);
25046 -       local_irq_enable();
25047 +       local_unlock_irq(event_lock);
25048  out_unlock:
25049         unlock_page(page);
25050  out:
25051 @@ -5438,10 +5441,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
25052  
25053         commit_charge(page, memcg, lrucare);
25054  
25055 -       local_irq_disable();
25056 +       local_lock_irq(event_lock);
25057         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
25058         memcg_check_events(memcg, page);
25059 -       local_irq_enable();
25060 +       local_unlock_irq(event_lock);
25061  
25062         if (do_memsw_account() && PageSwapCache(page)) {
25063                 swp_entry_t entry = { .val = page_private(page) };
25064 @@ -5497,14 +5500,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
25065                 memcg_oom_recover(memcg);
25066         }
25067  
25068 -       local_irq_save(flags);
25069 +       local_lock_irqsave(event_lock, flags);
25070         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
25071         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
25072         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
25073         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
25074         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
25075         memcg_check_events(memcg, dummy_page);
25076 -       local_irq_restore(flags);
25077 +       local_unlock_irqrestore(event_lock, flags);
25078  
25079         if (!mem_cgroup_is_root(memcg))
25080                 css_put_many(&memcg->css, nr_pages);
25081 @@ -5659,10 +5662,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
25082  
25083         commit_charge(newpage, memcg, false);
25084  
25085 -       local_irq_save(flags);
25086 +       local_lock_irqsave(event_lock, flags);
25087         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
25088         memcg_check_events(memcg, newpage);
25089 -       local_irq_restore(flags);
25090 +       local_unlock_irqrestore(event_lock, flags);
25091  }
25092  
25093  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
25094 @@ -5853,6 +5856,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
25095  {
25096         struct mem_cgroup *memcg, *swap_memcg;
25097         unsigned short oldid;
25098 +       unsigned long flags;
25099  
25100         VM_BUG_ON_PAGE(PageLRU(page), page);
25101         VM_BUG_ON_PAGE(page_count(page), page);
25102 @@ -5893,12 +5897,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
25103          * important here to have the interrupts disabled because it is the
25104          * only synchronisation we have for updating the per-CPU variables.
25105          */
25106 +       local_lock_irqsave(event_lock, flags);
25107 +#ifndef CONFIG_PREEMPT_RT_BASE
25108         VM_BUG_ON(!irqs_disabled());
25109 +#endif
25110         mem_cgroup_charge_statistics(memcg, page, false, -1);
25111         memcg_check_events(memcg, page);
25112  
25113         if (!mem_cgroup_is_root(memcg))
25114                 css_put(&memcg->css);
25115 +       local_unlock_irqrestore(event_lock, flags);
25116  }
25117  
25118  /*
25119 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
25120 index 6f4d27c5bb32..5cd25c745a8f 100644
25121 --- a/mm/mmu_context.c
25122 +++ b/mm/mmu_context.c
25123 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
25124         struct task_struct *tsk = current;
25125  
25126         task_lock(tsk);
25127 +       preempt_disable_rt();
25128         active_mm = tsk->active_mm;
25129         if (active_mm != mm) {
25130                 atomic_inc(&mm->mm_count);
25131 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
25132         }
25133         tsk->mm = mm;
25134         switch_mm(active_mm, mm, tsk);
25135 +       preempt_enable_rt();
25136         task_unlock(tsk);
25137  #ifdef finish_arch_post_lock_switch
25138         finish_arch_post_lock_switch();
25139 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
25140 index fbc38888252b..1cb08e1406ea 100644
25141 --- a/mm/page_alloc.c
25142 +++ b/mm/page_alloc.c
25143 @@ -61,6 +61,7 @@
25144  #include <linux/page_ext.h>
25145  #include <linux/hugetlb.h>
25146  #include <linux/sched/rt.h>
25147 +#include <linux/locallock.h>
25148  #include <linux/page_owner.h>
25149  #include <linux/kthread.h>
25150  #include <linux/memcontrol.h>
25151 @@ -281,6 +282,18 @@ EXPORT_SYMBOL(nr_node_ids);
25152  EXPORT_SYMBOL(nr_online_nodes);
25153  #endif
25154  
25155 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
25156 +
25157 +#ifdef CONFIG_PREEMPT_RT_BASE
25158 +# define cpu_lock_irqsave(cpu, flags)          \
25159 +       local_lock_irqsave_on(pa_lock, flags, cpu)
25160 +# define cpu_unlock_irqrestore(cpu, flags)     \
25161 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
25162 +#else
25163 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
25164 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
25165 +#endif
25166 +
25167  int page_group_by_mobility_disabled __read_mostly;
25168  
25169  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
25170 @@ -1092,7 +1105,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
25171  #endif /* CONFIG_DEBUG_VM */
25172  
25173  /*
25174 - * Frees a number of pages from the PCP lists
25175 + * Frees a number of pages which have been collected from the pcp lists.
25176   * Assumes all pages on list are in same zone, and of same order.
25177   * count is the number of pages to free.
25178   *
25179 @@ -1103,19 +1116,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
25180   * pinned" detection logic.
25181   */
25182  static void free_pcppages_bulk(struct zone *zone, int count,
25183 -                                       struct per_cpu_pages *pcp)
25184 +                              struct list_head *list)
25185  {
25186 -       int migratetype = 0;
25187 -       int batch_free = 0;
25188         unsigned long nr_scanned;
25189         bool isolated_pageblocks;
25190 +       unsigned long flags;
25191 +
25192 +       spin_lock_irqsave(&zone->lock, flags);
25193  
25194 -       spin_lock(&zone->lock);
25195         isolated_pageblocks = has_isolate_pageblock(zone);
25196         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
25197         if (nr_scanned)
25198                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
25199  
25200 +       while (!list_empty(list)) {
25201 +               struct page *page;
25202 +               int mt; /* migratetype of the to-be-freed page */
25203 +
25204 +               page = list_first_entry(list, struct page, lru);
25205 +               /* must delete as __free_one_page list manipulates */
25206 +               list_del(&page->lru);
25207 +
25208 +               mt = get_pcppage_migratetype(page);
25209 +               /* MIGRATE_ISOLATE page should not go to pcplists */
25210 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
25211 +               /* Pageblock could have been isolated meanwhile */
25212 +               if (unlikely(isolated_pageblocks))
25213 +                       mt = get_pageblock_migratetype(page);
25214 +
25215 +               if (bulkfree_pcp_prepare(page))
25216 +                       continue;
25217 +
25218 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
25219 +               trace_mm_page_pcpu_drain(page, 0, mt);
25220 +               count--;
25221 +       }
25222 +       WARN_ON(count != 0);
25223 +       spin_unlock_irqrestore(&zone->lock, flags);
25224 +}
25225 +
25226 +/*
25227 + * Moves a number of pages from the PCP lists to a local list; the
25228 + * pages are then freed outside of the locked region.
25229 + *
25230 + * Assumes all pages on list are in same zone, and of same order.
25231 + * count is the number of pages to free.
25232 + */
25233 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
25234 +                             struct list_head *dst)
25235 +{
25236 +       int migratetype = 0;
25237 +       int batch_free = 0;
25238 +
25239         while (count) {
25240                 struct page *page;
25241                 struct list_head *list;
25242 @@ -1131,7 +1183,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
25243                         batch_free++;
25244                         if (++migratetype == MIGRATE_PCPTYPES)
25245                                 migratetype = 0;
25246 -                       list = &pcp->lists[migratetype];
25247 +                       list = &src->lists[migratetype];
25248                 } while (list_empty(list));
25249  
25250                 /* This is the only non-empty list. Free them all. */
25251 @@ -1139,27 +1191,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
25252                         batch_free = count;
25253  
25254                 do {
25255 -                       int mt; /* migratetype of the to-be-freed page */
25256 -
25257                         page = list_last_entry(list, struct page, lru);
25258 -                       /* must delete as __free_one_page list manipulates */
25259                         list_del(&page->lru);
25260  
25261 -                       mt = get_pcppage_migratetype(page);
25262 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
25263 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
25264 -                       /* Pageblock could have been isolated meanwhile */
25265 -                       if (unlikely(isolated_pageblocks))
25266 -                               mt = get_pageblock_migratetype(page);
25267 -
25268 -                       if (bulkfree_pcp_prepare(page))
25269 -                               continue;
25270 -
25271 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
25272 -                       trace_mm_page_pcpu_drain(page, 0, mt);
25273 +                       list_add(&page->lru, dst);
25274                 } while (--count && --batch_free && !list_empty(list));
25275         }
25276 -       spin_unlock(&zone->lock);
25277  }
25278  
25279  static void free_one_page(struct zone *zone,
25280 @@ -1168,7 +1205,9 @@ static void free_one_page(struct zone *zone,
25281                                 int migratetype)
25282  {
25283         unsigned long nr_scanned;
25284 -       spin_lock(&zone->lock);
25285 +       unsigned long flags;
25286 +
25287 +       spin_lock_irqsave(&zone->lock, flags);
25288         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
25289         if (nr_scanned)
25290                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
25291 @@ -1178,7 +1217,7 @@ static void free_one_page(struct zone *zone,
25292                 migratetype = get_pfnblock_migratetype(page, pfn);
25293         }
25294         __free_one_page(page, pfn, zone, order, migratetype);
25295 -       spin_unlock(&zone->lock);
25296 +       spin_unlock_irqrestore(&zone->lock, flags);
25297  }
25298  
25299  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
25300 @@ -1264,10 +1303,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
25301                 return;
25302  
25303         migratetype = get_pfnblock_migratetype(page, pfn);
25304 -       local_irq_save(flags);
25305 +       local_lock_irqsave(pa_lock, flags);
25306         __count_vm_events(PGFREE, 1 << order);
25307         free_one_page(page_zone(page), page, pfn, order, migratetype);
25308 -       local_irq_restore(flags);
25309 +       local_unlock_irqrestore(pa_lock, flags);
25310  }
25311  
25312  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
25313 @@ -2282,16 +2321,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
25314  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
25315  {
25316         unsigned long flags;
25317 +       LIST_HEAD(dst);
25318         int to_drain, batch;
25319  
25320 -       local_irq_save(flags);
25321 +       local_lock_irqsave(pa_lock, flags);
25322         batch = READ_ONCE(pcp->batch);
25323         to_drain = min(pcp->count, batch);
25324         if (to_drain > 0) {
25325 -               free_pcppages_bulk(zone, to_drain, pcp);
25326 +               isolate_pcp_pages(to_drain, pcp, &dst);
25327                 pcp->count -= to_drain;
25328         }
25329 -       local_irq_restore(flags);
25330 +       local_unlock_irqrestore(pa_lock, flags);
25331 +       free_pcppages_bulk(zone, to_drain, &dst);
25332  }
25333  #endif
25334  
25335 @@ -2307,16 +2348,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
25336         unsigned long flags;
25337         struct per_cpu_pageset *pset;
25338         struct per_cpu_pages *pcp;
25339 +       LIST_HEAD(dst);
25340 +       int count;
25341  
25342 -       local_irq_save(flags);
25343 +       cpu_lock_irqsave(cpu, flags);
25344         pset = per_cpu_ptr(zone->pageset, cpu);
25345  
25346         pcp = &pset->pcp;
25347 -       if (pcp->count) {
25348 -               free_pcppages_bulk(zone, pcp->count, pcp);
25349 +       count = pcp->count;
25350 +       if (count) {
25351 +               isolate_pcp_pages(count, pcp, &dst);
25352                 pcp->count = 0;
25353         }
25354 -       local_irq_restore(flags);
25355 +       cpu_unlock_irqrestore(cpu, flags);
25356 +       if (count)
25357 +               free_pcppages_bulk(zone, count, &dst);
25358  }
25359  
25360  /*
25361 @@ -2402,8 +2448,17 @@ void drain_all_pages(struct zone *zone)
25362                 else
25363                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
25364         }
25365 +#ifndef CONFIG_PREEMPT_RT_BASE
25366         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
25367                                                                 zone, 1);
25368 +#else
25369 +       for_each_cpu(cpu, &cpus_with_pcps) {
25370 +               if (zone)
25371 +                       drain_pages_zone(cpu, zone);
25372 +               else
25373 +                       drain_pages(cpu);
25374 +       }
25375 +#endif
25376  }
25377  
25378  #ifdef CONFIG_HIBERNATION
25379 @@ -2463,7 +2518,7 @@ void free_hot_cold_page(struct page *page, bool cold)
25380  
25381         migratetype = get_pfnblock_migratetype(page, pfn);
25382         set_pcppage_migratetype(page, migratetype);
25383 -       local_irq_save(flags);
25384 +       local_lock_irqsave(pa_lock, flags);
25385         __count_vm_event(PGFREE);
25386  
25387         /*
25388 @@ -2489,12 +2544,17 @@ void free_hot_cold_page(struct page *page, bool cold)
25389         pcp->count++;
25390         if (pcp->count >= pcp->high) {
25391                 unsigned long batch = READ_ONCE(pcp->batch);
25392 -               free_pcppages_bulk(zone, batch, pcp);
25393 +               LIST_HEAD(dst);
25394 +
25395 +               isolate_pcp_pages(batch, pcp, &dst);
25396                 pcp->count -= batch;
25397 +               local_unlock_irqrestore(pa_lock, flags);
25398 +               free_pcppages_bulk(zone, batch, &dst);
25399 +               return;
25400         }
25401  
25402  out:
25403 -       local_irq_restore(flags);
25404 +       local_unlock_irqrestore(pa_lock, flags);
25405  }
25406  
25407  /*
25408 @@ -2629,7 +2689,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
25409                 struct per_cpu_pages *pcp;
25410                 struct list_head *list;
25411  
25412 -               local_irq_save(flags);
25413 +               local_lock_irqsave(pa_lock, flags);
25414                 do {
25415                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
25416                         list = &pcp->lists[migratetype];
25417 @@ -2656,7 +2716,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
25418                  * allocate greater than order-1 page units with __GFP_NOFAIL.
25419                  */
25420                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
25421 -               spin_lock_irqsave(&zone->lock, flags);
25422 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
25423  
25424                 do {
25425                         page = NULL;
25426 @@ -2668,22 +2728,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
25427                         if (!page)
25428                                 page = __rmqueue(zone, order, migratetype);
25429                 } while (page && check_new_pages(page, order));
25430 -               spin_unlock(&zone->lock);
25431 -               if (!page)
25432 +               if (!page) {
25433 +                       spin_unlock(&zone->lock);
25434                         goto failed;
25435 +               }
25436                 __mod_zone_freepage_state(zone, -(1 << order),
25437                                           get_pcppage_migratetype(page));
25438 +               spin_unlock(&zone->lock);
25439         }
25440  
25441         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
25442         zone_statistics(preferred_zone, zone, gfp_flags);
25443 -       local_irq_restore(flags);
25444 +       local_unlock_irqrestore(pa_lock, flags);
25445  
25446         VM_BUG_ON_PAGE(bad_range(zone, page), page);
25447         return page;
25448  
25449  failed:
25450 -       local_irq_restore(flags);
25451 +       local_unlock_irqrestore(pa_lock, flags);
25452         return NULL;
25453  }
25454  
25455 @@ -6561,7 +6623,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
25456         int cpu = (unsigned long)hcpu;
25457  
25458         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
25459 +               local_lock_irq_on(swapvec_lock, cpu);
25460                 lru_add_drain_cpu(cpu);
25461 +               local_unlock_irq_on(swapvec_lock, cpu);
25462                 drain_pages(cpu);
25463  
25464                 /*
25465 @@ -6587,6 +6651,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
25466  void __init page_alloc_init(void)
25467  {
25468         hotcpu_notifier(page_alloc_cpu_notify, 0);
25469 +       local_irq_lock_init(pa_lock);
25470  }
25471  
25472  /*
25473 @@ -7422,7 +7487,7 @@ void zone_pcp_reset(struct zone *zone)
25474         struct per_cpu_pageset *pset;
25475  
25476         /* avoid races with drain_pages()  */
25477 -       local_irq_save(flags);
25478 +       local_lock_irqsave(pa_lock, flags);
25479         if (zone->pageset != &boot_pageset) {
25480                 for_each_online_cpu(cpu) {
25481                         pset = per_cpu_ptr(zone->pageset, cpu);
25482 @@ -7431,7 +7496,7 @@ void zone_pcp_reset(struct zone *zone)
25483                 free_percpu(zone->pageset);
25484                 zone->pageset = &boot_pageset;
25485         }
25486 -       local_irq_restore(flags);
25487 +       local_unlock_irqrestore(pa_lock, flags);
25488  }
25489  
25490  #ifdef CONFIG_MEMORY_HOTREMOVE
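The page-allocator changes above repeat one shape: work that used to run entirely inside the IRQ-off section is split so that only the cheap part stays under pa_lock. isolate_pcp_pages() merely detaches pages from the per-CPU lists onto a private list while the lock is held; free_pcppages_bulk() then returns them to the buddy lists afterwards, taking zone->lock itself (spin_lock_irqsave) but no longer nested inside the per-CPU IRQ-off region, which keeps the locked sections short. A self-contained userspace sketch of that split, with malloc'd nodes and a pthread mutex standing in for pa_lock (names are invented for the illustration):

#include <pthread.h>
#include <stdlib.h>

struct node {
        struct node *next;
        /* payload ... */
};

static struct node *pcp_list;                   /* the "per-CPU" cache */
static pthread_mutex_t pa_lock = PTHREAD_MUTEX_INITIALIZER;

/* Step 1, like isolate_pcp_pages(): only detach nodes while the lock is
 * held; nothing slow happens inside the critical section. */
static struct node *isolate_nodes(int count)
{
        struct node *batch = NULL;

        pthread_mutex_lock(&pa_lock);
        while (count-- > 0 && pcp_list) {
                struct node *n = pcp_list;

                pcp_list = n->next;
                n->next = batch;
                batch = n;
        }
        pthread_mutex_unlock(&pa_lock);
        return batch;
}

/* Step 2, like free_pcppages_bulk() in the patched code: the expensive
 * freeing runs after the per-CPU lock has been dropped (the kernel
 * version still takes zone->lock here, just not nested inside pa_lock). */
static void free_batch(struct node *batch)
{
        while (batch) {
                struct node *n = batch;

                batch = batch->next;
                free(n);
        }
}

/* Mirrors drain_zone_pages(): isolate under the lock, then free outside. */
void drain(int count)
{
        free_batch(isolate_nodes(count));
}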
25491 diff --git a/mm/percpu.c b/mm/percpu.c
25492 index f014cebbf405..4e739fcf91bf 100644
25493 --- a/mm/percpu.c
25494 +++ b/mm/percpu.c
25495 @@ -1283,18 +1283,7 @@ void free_percpu(void __percpu *ptr)
25496  }
25497  EXPORT_SYMBOL_GPL(free_percpu);
25498  
25499 -/**
25500 - * is_kernel_percpu_address - test whether address is from static percpu area
25501 - * @addr: address to test
25502 - *
25503 - * Test whether @addr belongs to in-kernel static percpu area.  Module
25504 - * static percpu areas are not considered.  For those, use
25505 - * is_module_percpu_address().
25506 - *
25507 - * RETURNS:
25508 - * %true if @addr is from in-kernel static percpu area, %false otherwise.
25509 - */
25510 -bool is_kernel_percpu_address(unsigned long addr)
25511 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
25512  {
25513  #ifdef CONFIG_SMP
25514         const size_t static_size = __per_cpu_end - __per_cpu_start;
25515 @@ -1303,16 +1292,39 @@ bool is_kernel_percpu_address(unsigned long addr)
25516  
25517         for_each_possible_cpu(cpu) {
25518                 void *start = per_cpu_ptr(base, cpu);
25519 +               void *va = (void *)addr;
25520  
25521 -               if ((void *)addr >= start && (void *)addr < start + static_size)
25522 +               if (va >= start && va < start + static_size) {
25523 +                       if (can_addr) {
25524 +                               *can_addr = (unsigned long) (va - start);
25525 +                               *can_addr += (unsigned long)
25526 +                                       per_cpu_ptr(base, get_boot_cpu_id());
25527 +                       }
25528                         return true;
25529 -        }
25530 +               }
25531 +       }
25532  #endif
25533         /* on UP, can't distinguish from other static vars, always false */
25534         return false;
25535  }
25536  
25537  /**
25538 + * is_kernel_percpu_address - test whether address is from static percpu area
25539 + * @addr: address to test
25540 + *
25541 + * Test whether @addr belongs to in-kernel static percpu area.  Module
25542 + * static percpu areas are not considered.  For those, use
25543 + * is_module_percpu_address().
25544 + *
25545 + * RETURNS:
25546 + * %true if @addr is from in-kernel static percpu area, %false otherwise.
25547 + */
25548 +bool is_kernel_percpu_address(unsigned long addr)
25549 +{
25550 +       return __is_kernel_percpu_address(addr, NULL);
25551 +}
25552 +
25553 +/**
25554   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
25555   * @addr: the address to be converted to physical address
25556   *
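The mm/percpu.c change above splits is_kernel_percpu_address() so that the new __is_kernel_percpu_address() can also report where the address lies: when @addr falls inside some CPU's copy of the static per-CPU area, *can_addr is set to the equivalent address in the boot CPU's copy (the offset within the area added to the boot CPU's base), giving the per-CPU object a single canonical identity no matter which copy was passed in. A hedged, userspace-only sketch of the range check and the offset arithmetic; the arrays and names below are made up for the illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define NR_CPUS         4
#define AREA_SIZE       4096

/* One copy of the "static per-CPU area" per CPU; copy 0 plays the role
 * of the boot CPU's copy. */
static char pcpu_area[NR_CPUS][AREA_SIZE];

/* Return true if addr points into any CPU's copy; if so, optionally
 * report the equivalent address inside copy 0, the way *can_addr is
 * filled in above with the boot-CPU base plus the offset. */
bool is_percpu_address(const void *addr, uintptr_t *can_addr)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                const char *start = pcpu_area[cpu];
                const char *va = addr;

                if (va >= start && va < start + AREA_SIZE) {
                        if (can_addr) {
                                size_t offset = va - start;

                                *can_addr = (uintptr_t)&pcpu_area[0][offset];
                        }
                        return true;
                }
        }
        return false;
}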
25557 diff --git a/mm/slab.h b/mm/slab.h
25558 index ceb7d70cdb76..dfd281e43fbe 100644
25559 --- a/mm/slab.h
25560 +++ b/mm/slab.h
25561 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
25562   * The slab lists for all objects.
25563   */
25564  struct kmem_cache_node {
25565 +#ifdef CONFIG_SLUB
25566 +       raw_spinlock_t list_lock;
25567 +#else
25568         spinlock_t list_lock;
25569 +#endif
25570  
25571  #ifdef CONFIG_SLAB
25572         struct list_head slabs_partial; /* partial list first, better asm code */
25573 diff --git a/mm/slub.c b/mm/slub.c
25574 index edc79ca3c6d5..67eb368b9314 100644
25575 --- a/mm/slub.c
25576 +++ b/mm/slub.c
25577 @@ -1144,7 +1144,7 @@ static noinline int free_debug_processing(
25578         unsigned long uninitialized_var(flags);
25579         int ret = 0;
25580  
25581 -       spin_lock_irqsave(&n->list_lock, flags);
25582 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25583         slab_lock(page);
25584  
25585         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
25586 @@ -1179,7 +1179,7 @@ static noinline int free_debug_processing(
25587                          bulk_cnt, cnt);
25588  
25589         slab_unlock(page);
25590 -       spin_unlock_irqrestore(&n->list_lock, flags);
25591 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25592         if (!ret)
25593                 slab_fix(s, "Object at 0x%p not freed", object);
25594         return ret;
25595 @@ -1307,6 +1307,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
25596  
25597  #endif /* CONFIG_SLUB_DEBUG */
25598  
25599 +struct slub_free_list {
25600 +       raw_spinlock_t          lock;
25601 +       struct list_head        list;
25602 +};
25603 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
25604 +
25605  /*
25606   * Hooks for other subsystems that check memory allocations. In a typical
25607   * production configuration these hooks all should produce no code at all.
25608 @@ -1530,10 +1536,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
25609         void *start, *p;
25610         int idx, order;
25611         bool shuffle;
25612 +       bool enableirqs = false;
25613  
25614         flags &= gfp_allowed_mask;
25615  
25616         if (gfpflags_allow_blocking(flags))
25617 +               enableirqs = true;
25618 +#ifdef CONFIG_PREEMPT_RT_FULL
25619 +       if (system_state == SYSTEM_RUNNING)
25620 +               enableirqs = true;
25621 +#endif
25622 +       if (enableirqs)
25623                 local_irq_enable();
25624  
25625         flags |= s->allocflags;
25626 @@ -1608,7 +1621,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
25627         page->frozen = 1;
25628  
25629  out:
25630 -       if (gfpflags_allow_blocking(flags))
25631 +       if (enableirqs)
25632                 local_irq_disable();
25633         if (!page)
25634                 return NULL;
25635 @@ -1667,6 +1680,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
25636         __free_pages(page, order);
25637  }
25638  
25639 +static void free_delayed(struct list_head *h)
25640 +{
25641 +       while (!list_empty(h)) {
25642 +               struct page *page = list_first_entry(h, struct page, lru);
25643 +
25644 +               list_del(&page->lru);
25645 +               __free_slab(page->slab_cache, page);
25646 +       }
25647 +}
25648 +
25649  #define need_reserve_slab_rcu                                          \
25650         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
25651  
25652 @@ -1698,6 +1721,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
25653                 }
25654  
25655                 call_rcu(head, rcu_free_slab);
25656 +       } else if (irqs_disabled()) {
25657 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
25658 +
25659 +               raw_spin_lock(&f->lock);
25660 +               list_add(&page->lru, &f->list);
25661 +               raw_spin_unlock(&f->lock);
25662         } else
25663                 __free_slab(s, page);
25664  }
25665 @@ -1805,7 +1834,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
25666         if (!n || !n->nr_partial)
25667                 return NULL;
25668  
25669 -       spin_lock(&n->list_lock);
25670 +       raw_spin_lock(&n->list_lock);
25671         list_for_each_entry_safe(page, page2, &n->partial, lru) {
25672                 void *t;
25673  
25674 @@ -1830,7 +1859,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
25675                         break;
25676  
25677         }
25678 -       spin_unlock(&n->list_lock);
25679 +       raw_spin_unlock(&n->list_lock);
25680         return object;
25681  }
25682  
25683 @@ -2076,7 +2105,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25684                          * that acquire_slab() will see a slab page that
25685                          * is frozen
25686                          */
25687 -                       spin_lock(&n->list_lock);
25688 +                       raw_spin_lock(&n->list_lock);
25689                 }
25690         } else {
25691                 m = M_FULL;
25692 @@ -2087,7 +2116,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25693                          * slabs from diagnostic functions will not see
25694                          * any frozen slabs.
25695                          */
25696 -                       spin_lock(&n->list_lock);
25697 +                       raw_spin_lock(&n->list_lock);
25698                 }
25699         }
25700  
25701 @@ -2122,7 +2151,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25702                 goto redo;
25703  
25704         if (lock)
25705 -               spin_unlock(&n->list_lock);
25706 +               raw_spin_unlock(&n->list_lock);
25707  
25708         if (m == M_FREE) {
25709                 stat(s, DEACTIVATE_EMPTY);
25710 @@ -2154,10 +2183,10 @@ static void unfreeze_partials(struct kmem_cache *s,
25711                 n2 = get_node(s, page_to_nid(page));
25712                 if (n != n2) {
25713                         if (n)
25714 -                               spin_unlock(&n->list_lock);
25715 +                               raw_spin_unlock(&n->list_lock);
25716  
25717                         n = n2;
25718 -                       spin_lock(&n->list_lock);
25719 +                       raw_spin_lock(&n->list_lock);
25720                 }
25721  
25722                 do {
25723 @@ -2186,7 +2215,7 @@ static void unfreeze_partials(struct kmem_cache *s,
25724         }
25725  
25726         if (n)
25727 -               spin_unlock(&n->list_lock);
25728 +               raw_spin_unlock(&n->list_lock);
25729  
25730         while (discard_page) {
25731                 page = discard_page;
25732 @@ -2225,14 +2254,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
25733                         pobjects = oldpage->pobjects;
25734                         pages = oldpage->pages;
25735                         if (drain && pobjects > s->cpu_partial) {
25736 +                               struct slub_free_list *f;
25737                                 unsigned long flags;
25738 +                               LIST_HEAD(tofree);
25739                                 /*
25740                                  * partial array is full. Move the existing
25741                                  * set to the per node partial list.
25742                                  */
25743                                 local_irq_save(flags);
25744                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
25745 +                               f = this_cpu_ptr(&slub_free_list);
25746 +                               raw_spin_lock(&f->lock);
25747 +                               list_splice_init(&f->list, &tofree);
25748 +                               raw_spin_unlock(&f->lock);
25749                                 local_irq_restore(flags);
25750 +                               free_delayed(&tofree);
25751                                 oldpage = NULL;
25752                                 pobjects = 0;
25753                                 pages = 0;
25754 @@ -2304,7 +2340,22 @@ static bool has_cpu_slab(int cpu, void *info)
25755  
25756  static void flush_all(struct kmem_cache *s)
25757  {
25758 +       LIST_HEAD(tofree);
25759 +       int cpu;
25760 +
25761         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
25762 +       for_each_online_cpu(cpu) {
25763 +               struct slub_free_list *f;
25764 +
25765 +               if (!has_cpu_slab(cpu, s))
25766 +                       continue;
25767 +
25768 +               f = &per_cpu(slub_free_list, cpu);
25769 +               raw_spin_lock_irq(&f->lock);
25770 +               list_splice_init(&f->list, &tofree);
25771 +               raw_spin_unlock_irq(&f->lock);
25772 +               free_delayed(&tofree);
25773 +       }
25774  }
25775  
25776  /*
25777 @@ -2359,10 +2410,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
25778         unsigned long x = 0;
25779         struct page *page;
25780  
25781 -       spin_lock_irqsave(&n->list_lock, flags);
25782 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25783         list_for_each_entry(page, &n->partial, lru)
25784                 x += get_count(page);
25785 -       spin_unlock_irqrestore(&n->list_lock, flags);
25786 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25787         return x;
25788  }
25789  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
25790 @@ -2500,8 +2551,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
25791   * already disabled (which is the case for bulk allocation).
25792   */
25793  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25794 -                         unsigned long addr, struct kmem_cache_cpu *c)
25795 +                         unsigned long addr, struct kmem_cache_cpu *c,
25796 +                         struct list_head *to_free)
25797  {
25798 +       struct slub_free_list *f;
25799         void *freelist;
25800         struct page *page;
25801  
25802 @@ -2561,6 +2614,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25803         VM_BUG_ON(!c->page->frozen);
25804         c->freelist = get_freepointer(s, freelist);
25805         c->tid = next_tid(c->tid);
25806 +
25807 +out:
25808 +       f = this_cpu_ptr(&slub_free_list);
25809 +       raw_spin_lock(&f->lock);
25810 +       list_splice_init(&f->list, to_free);
25811 +       raw_spin_unlock(&f->lock);
25812 +
25813         return freelist;
25814  
25815  new_slab:
25816 @@ -2592,7 +2652,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25817         deactivate_slab(s, page, get_freepointer(s, freelist));
25818         c->page = NULL;
25819         c->freelist = NULL;
25820 -       return freelist;
25821 +       goto out;
25822  }
25823  
25824  /*
25825 @@ -2604,6 +2664,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25826  {
25827         void *p;
25828         unsigned long flags;
25829 +       LIST_HEAD(tofree);
25830  
25831         local_irq_save(flags);
25832  #ifdef CONFIG_PREEMPT
25833 @@ -2615,8 +2676,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25834         c = this_cpu_ptr(s->cpu_slab);
25835  #endif
25836  
25837 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
25838 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
25839         local_irq_restore(flags);
25840 +       free_delayed(&tofree);
25841         return p;
25842  }
25843  
25844 @@ -2802,7 +2864,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25845  
25846         do {
25847                 if (unlikely(n)) {
25848 -                       spin_unlock_irqrestore(&n->list_lock, flags);
25849 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25850                         n = NULL;
25851                 }
25852                 prior = page->freelist;
25853 @@ -2834,7 +2896,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25854                                  * Otherwise the list_lock will synchronize with
25855                                  * other processors updating the list of slabs.
25856                                  */
25857 -                               spin_lock_irqsave(&n->list_lock, flags);
25858 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
25859  
25860                         }
25861                 }
25862 @@ -2876,7 +2938,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25863                 add_partial(n, page, DEACTIVATE_TO_TAIL);
25864                 stat(s, FREE_ADD_PARTIAL);
25865         }
25866 -       spin_unlock_irqrestore(&n->list_lock, flags);
25867 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25868         return;
25869  
25870  slab_empty:
25871 @@ -2891,7 +2953,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25872                 remove_full(s, n, page);
25873         }
25874  
25875 -       spin_unlock_irqrestore(&n->list_lock, flags);
25876 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25877         stat(s, FREE_SLAB);
25878         discard_slab(s, page);
25879  }
25880 @@ -3096,6 +3158,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25881                           void **p)
25882  {
25883         struct kmem_cache_cpu *c;
25884 +       LIST_HEAD(to_free);
25885         int i;
25886  
25887         /* memcg and kmem_cache debug support */
25888 @@ -3119,7 +3182,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25889                          * of re-populating per CPU c->freelist
25890                          */
25891                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
25892 -                                           _RET_IP_, c);
25893 +                                           _RET_IP_, c, &to_free);
25894                         if (unlikely(!p[i]))
25895                                 goto error;
25896  
25897 @@ -3131,6 +3194,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25898         }
25899         c->tid = next_tid(c->tid);
25900         local_irq_enable();
25901 +       free_delayed(&to_free);
25902  
25903         /* Clear memory outside IRQ disabled fastpath loop */
25904         if (unlikely(flags & __GFP_ZERO)) {
25905 @@ -3278,7 +3342,7 @@ static void
25906  init_kmem_cache_node(struct kmem_cache_node *n)
25907  {
25908         n->nr_partial = 0;
25909 -       spin_lock_init(&n->list_lock);
25910 +       raw_spin_lock_init(&n->list_lock);
25911         INIT_LIST_HEAD(&n->partial);
25912  #ifdef CONFIG_SLUB_DEBUG
25913         atomic_long_set(&n->nr_slabs, 0);
25914 @@ -3622,6 +3686,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
25915                                                         const char *text)
25916  {
25917  #ifdef CONFIG_SLUB_DEBUG
25918 +#ifdef CONFIG_PREEMPT_RT_BASE
25919 +       /* XXX move out of irq-off section */
25920 +       slab_err(s, page, text, s->name);
25921 +#else
25922         void *addr = page_address(page);
25923         void *p;
25924         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
25925 @@ -3642,6 +3710,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
25926         slab_unlock(page);
25927         kfree(map);
25928  #endif
25929 +#endif
25930  }
25931  
25932  /*
25933 @@ -3655,7 +3724,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
25934         struct page *page, *h;
25935  
25936         BUG_ON(irqs_disabled());
25937 -       spin_lock_irq(&n->list_lock);
25938 +       raw_spin_lock_irq(&n->list_lock);
25939         list_for_each_entry_safe(page, h, &n->partial, lru) {
25940                 if (!page->inuse) {
25941                         remove_partial(n, page);
25942 @@ -3665,7 +3734,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
25943                         "Objects remaining in %s on __kmem_cache_shutdown()");
25944                 }
25945         }
25946 -       spin_unlock_irq(&n->list_lock);
25947 +       raw_spin_unlock_irq(&n->list_lock);
25948  
25949         list_for_each_entry_safe(page, h, &discard, lru)
25950                 discard_slab(s, page);
25951 @@ -3908,7 +3977,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
25952                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
25953                         INIT_LIST_HEAD(promote + i);
25954  
25955 -               spin_lock_irqsave(&n->list_lock, flags);
25956 +               raw_spin_lock_irqsave(&n->list_lock, flags);
25957  
25958                 /*
25959                  * Build lists of slabs to discard or promote.
25960 @@ -3939,7 +4008,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
25961                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
25962                         list_splice(promote + i, &n->partial);
25963  
25964 -               spin_unlock_irqrestore(&n->list_lock, flags);
25965 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
25966  
25967                 /* Release empty slabs */
25968                 list_for_each_entry_safe(page, t, &discard, lru)
25969 @@ -4115,6 +4184,12 @@ void __init kmem_cache_init(void)
25970  {
25971         static __initdata struct kmem_cache boot_kmem_cache,
25972                 boot_kmem_cache_node;
25973 +       int cpu;
25974 +
25975 +       for_each_possible_cpu(cpu) {
25976 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
25977 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
25978 +       }
25979  
25980         if (debug_guardpage_minorder())
25981                 slub_max_order = 0;
25982 @@ -4323,7 +4398,7 @@ static int validate_slab_node(struct kmem_cache *s,
25983         struct page *page;
25984         unsigned long flags;
25985  
25986 -       spin_lock_irqsave(&n->list_lock, flags);
25987 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25988  
25989         list_for_each_entry(page, &n->partial, lru) {
25990                 validate_slab_slab(s, page, map);
25991 @@ -4345,7 +4420,7 @@ static int validate_slab_node(struct kmem_cache *s,
25992                        s->name, count, atomic_long_read(&n->nr_slabs));
25993  
25994  out:
25995 -       spin_unlock_irqrestore(&n->list_lock, flags);
25996 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25997         return count;
25998  }
25999  
26000 @@ -4533,12 +4608,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
26001                 if (!atomic_long_read(&n->nr_slabs))
26002                         continue;
26003  
26004 -               spin_lock_irqsave(&n->list_lock, flags);
26005 +               raw_spin_lock_irqsave(&n->list_lock, flags);
26006                 list_for_each_entry(page, &n->partial, lru)
26007                         process_slab(&t, s, page, alloc, map);
26008                 list_for_each_entry(page, &n->full, lru)
26009                         process_slab(&t, s, page, alloc, map);
26010 -               spin_unlock_irqrestore(&n->list_lock, flags);
26011 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
26012         }
26013  
26014         for (i = 0; i < t.count; i++) {
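Two things happen in the SLUB hunks above. First, kmem_cache_node->list_lock becomes a raw_spinlock_t for SLUB: on PREEMPT_RT an ordinary spinlock_t is a sleeping lock, and list_lock is presumably taken in sections that cannot use one, so it is kept as a true spinning lock. Second, freeing a slab page back to the page allocator is not always safe with interrupts disabled on RT, so free_slab() stashes such pages on the per-CPU slub_free_list and free_delayed() releases them later, once the callers (___slab_alloc(), flush_all(), kmem_cache_alloc_bulk(), put_cpu_partial()) have re-enabled interrupts and drained their to_free lists. A small userspace sketch of that defer-and-drain idea, using a thread-local list in place of the per-CPU one and a flag in place of irqs_disabled(); everything named below is illustrative only:

#include <stdbool.h>
#include <stdlib.h>

struct deferred {
        struct deferred *next;
};

/* Thread-local stand-ins for the per-CPU slub_free_list and for
 * irqs_disabled(); in the kernel the list additionally has its own
 * raw lock because other contexts may drain it. */
static _Thread_local struct deferred *deferred_head;
static _Thread_local bool cannot_free_now;

/* Like free_slab(): if freeing is not allowed right now, queue the
 * object instead (this assumes the object is at least as large as
 * struct deferred, the way the kernel reuses page->lru). */
void release(void *obj)
{
        if (cannot_free_now) {
                struct deferred *d = obj;

                d->next = deferred_head;
                deferred_head = d;
        } else {
                free(obj);
        }
}

/* Like free_delayed(): called once the caller is back in a context
 * where freeing is allowed, e.g. after interrupts are re-enabled. */
void drain_deferred(void)
{
        while (deferred_head) {
                struct deferred *d = deferred_head;

                deferred_head = d->next;
                free(d);
        }
}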
26015 diff --git a/mm/swap.c b/mm/swap.c
26016 index 4dcf852e1e6d..69c3a5b24060 100644
26017 --- a/mm/swap.c
26018 +++ b/mm/swap.c
26019 @@ -32,6 +32,7 @@
26020  #include <linux/memcontrol.h>
26021  #include <linux/gfp.h>
26022  #include <linux/uio.h>
26023 +#include <linux/locallock.h>
26024  #include <linux/hugetlb.h>
26025  #include <linux/page_idle.h>
26026  
26027 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
26028  #ifdef CONFIG_SMP
26029  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
26030  #endif
26031 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
26032 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
26033  
26034  /*
26035   * This path almost never happens for VM activity - pages are normally
26036 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
26037                 unsigned long flags;
26038  
26039                 get_page(page);
26040 -               local_irq_save(flags);
26041 +               local_lock_irqsave(rotate_lock, flags);
26042                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
26043                 if (!pagevec_add(pvec, page) || PageCompound(page))
26044                         pagevec_move_tail(pvec);
26045 -               local_irq_restore(flags);
26046 +               local_unlock_irqrestore(rotate_lock, flags);
26047         }
26048  }
26049  
26050 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
26051  {
26052         page = compound_head(page);
26053         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
26054 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
26055 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
26056 +                                                      activate_page_pvecs);
26057  
26058                 get_page(page);
26059                 if (!pagevec_add(pvec, page) || PageCompound(page))
26060                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
26061 -               put_cpu_var(activate_page_pvecs);
26062 +               put_locked_var(swapvec_lock, activate_page_pvecs);
26063         }
26064  }
26065  
26066 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
26067  
26068  static void __lru_cache_activate_page(struct page *page)
26069  {
26070 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
26071 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
26072         int i;
26073  
26074         /*
26075 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
26076                 }
26077         }
26078  
26079 -       put_cpu_var(lru_add_pvec);
26080 +       put_locked_var(swapvec_lock, lru_add_pvec);
26081  }
26082  
26083  /*
26084 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
26085  
26086  static void __lru_cache_add(struct page *page)
26087  {
26088 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
26089 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
26090  
26091         get_page(page);
26092         if (!pagevec_add(pvec, page) || PageCompound(page))
26093                 __pagevec_lru_add(pvec);
26094 -       put_cpu_var(lru_add_pvec);
26095 +       put_locked_var(swapvec_lock, lru_add_pvec);
26096  }
26097  
26098  /**
26099 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
26100                 unsigned long flags;
26101  
26102                 /* No harm done if a racing interrupt already did this */
26103 -               local_irq_save(flags);
26104 +#ifdef CONFIG_PREEMPT_RT_BASE
26105 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
26106                 pagevec_move_tail(pvec);
26107 -               local_irq_restore(flags);
26108 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
26109 +#else
26110 +               local_lock_irqsave(rotate_lock, flags);
26111 +               pagevec_move_tail(pvec);
26112 +               local_unlock_irqrestore(rotate_lock, flags);
26113 +#endif
26114         }
26115  
26116         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
26117 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
26118                 return;
26119  
26120         if (likely(get_page_unless_zero(page))) {
26121 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
26122 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
26123 +                                                      lru_deactivate_file_pvecs);
26124  
26125                 if (!pagevec_add(pvec, page) || PageCompound(page))
26126                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
26127 -               put_cpu_var(lru_deactivate_file_pvecs);
26128 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
26129         }
26130  }
26131  
26132 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
26133  void deactivate_page(struct page *page)
26134  {
26135         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
26136 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
26137 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
26138 +                                                      lru_deactivate_pvecs);
26139  
26140                 get_page(page);
26141                 if (!pagevec_add(pvec, page) || PageCompound(page))
26142                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
26143 -               put_cpu_var(lru_deactivate_pvecs);
26144 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
26145         }
26146  }
26147  
26148  void lru_add_drain(void)
26149  {
26150 -       lru_add_drain_cpu(get_cpu());
26151 -       put_cpu();
26152 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
26153 +       local_unlock_cpu(swapvec_lock);
26154  }
26155  
26156 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
26157 +#ifdef CONFIG_PREEMPT_RT_BASE
26158 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
26159  {
26160 -       lru_add_drain();
26161 +       local_lock_on(swapvec_lock, cpu);
26162 +       lru_add_drain_cpu(cpu);
26163 +       local_unlock_on(swapvec_lock, cpu);
26164  }
26165  
26166 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
26167 +#else
26168  
26169  /*
26170   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
26171 @@ -686,6 +701,22 @@ static int __init lru_init(void)
26172  }
26173  early_initcall(lru_init);
26174  
26175 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
26176 +{
26177 +       lru_add_drain();
26178 +}
26179 +
26180 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
26181 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
26182 +{
26183 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
26184 +
26185 +       INIT_WORK(work, lru_add_drain_per_cpu);
26186 +       queue_work_on(cpu, lru_add_drain_wq, work);
26187 +       cpumask_set_cpu(cpu, has_work);
26188 +}
26189 +#endif
26190 +
26191  void lru_add_drain_all(void)
26192  {
26193         static DEFINE_MUTEX(lock);
26194 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
26195         cpumask_clear(&has_work);
26196  
26197         for_each_online_cpu(cpu) {
26198 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
26199 -
26200                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
26201                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
26202                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
26203                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
26204 -                   need_activate_page_drain(cpu)) {
26205 -                       INIT_WORK(work, lru_add_drain_per_cpu);
26206 -                       queue_work_on(cpu, lru_add_drain_wq, work);
26207 -                       cpumask_set_cpu(cpu, &has_work);
26208 -               }
26209 +                   need_activate_page_drain(cpu))
26210 +                       remote_lru_add_drain(cpu, &has_work);
26211         }
26212  
26213 +#ifndef CONFIG_PREEMPT_RT_BASE
26214         for_each_cpu(cpu, &has_work)
26215                 flush_work(&per_cpu(lru_add_drain_work, cpu));
26216 +#endif
26217  
26218         put_online_cpus();
26219         mutex_unlock(&lock);
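In mm/swap.c the per-CPU pagevecs stop being protected by get_cpu_var()'s implicit preempt-off section and are accessed under swapvec_lock (and rotate_lock for the rotation vector) instead. Besides keeping the sections preemptible, that is what lets the RT variant of lru_add_drain_all() drain another CPU's pagevecs directly: remote_lru_add_drain() takes the victim CPU's lock with local_lock_on() rather than queueing a work item on every CPU that has pending pages. A userspace sketch of draining a remote owner's buffer under the owner's own lock; the structures and sizes are invented for the illustration:

#include <pthread.h>

#define NR_CPUS         4
#define PVEC_SIZE       14

struct pagevec {
        pthread_mutex_t lock;           /* per-owner lock, like swapvec_lock */
        int nr;
        void *pages[PVEC_SIZE];
};

static struct pagevec pvecs[NR_CPUS] = {
        [0 ... NR_CPUS - 1] = { .lock = PTHREAD_MUTEX_INITIALIZER },
};

static void drain_one(struct pagevec *pv)
{
        /* ... flush pv->pages[0..nr) to the shared LRU here ... */
        pv->nr = 0;
}

/* Local path: the owner drains its own vector, as lru_add_drain() does
 * after taking swapvec_lock for the current CPU. */
void lru_drain_local(int cpu)
{
        pthread_mutex_lock(&pvecs[cpu].lock);
        drain_one(&pvecs[cpu]);
        pthread_mutex_unlock(&pvecs[cpu].lock);
}

/* RT-style remote path, like remote_lru_add_drain(): any thread may
 * drain cpu's vector because the data is protected by its own lock,
 * not by "only that CPU ever touches it". */
void lru_drain_all(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                pthread_mutex_lock(&pvecs[cpu].lock);
                if (pvecs[cpu].nr)
                        drain_one(&pvecs[cpu]);
                pthread_mutex_unlock(&pvecs[cpu].lock);
        }
}

The non-RT build keeps the work-queue scheme, which is why the flush_work() loop above is compiled out under CONFIG_PREEMPT_RT_BASE.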
26220 diff --git a/mm/truncate.c b/mm/truncate.c
26221 index 9c809e7d73c3..b7681e888ba0 100644
26222 --- a/mm/truncate.c
26223 +++ b/mm/truncate.c
26224 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
26225          * protected by mapping->tree_lock.
26226          */
26227         if (!workingset_node_shadows(node) &&
26228 -           !list_empty(&node->private_list))
26229 -               list_lru_del(&workingset_shadow_nodes,
26230 +           !list_empty(&node->private_list)) {
26231 +               local_lock(workingset_shadow_lock);
26232 +               list_lru_del(&__workingset_shadow_nodes,
26233                                 &node->private_list);
26234 +               local_unlock(workingset_shadow_lock);
26235 +       }
26236         __radix_tree_delete_node(&mapping->page_tree, node);
26237  unlock:
26238         spin_unlock_irq(&mapping->tree_lock);
26239 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
26240 index 195de42bea1f..b46cb686fde7 100644
26241 --- a/mm/vmalloc.c
26242 +++ b/mm/vmalloc.c
26243 @@ -855,7 +855,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
26244         struct vmap_block *vb;
26245         struct vmap_area *va;
26246         unsigned long vb_idx;
26247 -       int node, err;
26248 +       int node, err, cpu;
26249         void *vaddr;
26250  
26251         node = numa_node_id();
26252 @@ -898,11 +898,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
26253         BUG_ON(err);
26254         radix_tree_preload_end();
26255  
26256 -       vbq = &get_cpu_var(vmap_block_queue);
26257 +       cpu = get_cpu_light();
26258 +       vbq = this_cpu_ptr(&vmap_block_queue);
26259         spin_lock(&vbq->lock);
26260         list_add_tail_rcu(&vb->free_list, &vbq->free);
26261         spin_unlock(&vbq->lock);
26262 -       put_cpu_var(vmap_block_queue);
26263 +       put_cpu_light();
26264  
26265         return vaddr;
26266  }
26267 @@ -971,6 +972,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
26268         struct vmap_block *vb;
26269         void *vaddr = NULL;
26270         unsigned int order;
26271 +       int cpu;
26272  
26273         BUG_ON(offset_in_page(size));
26274         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
26275 @@ -985,7 +987,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
26276         order = get_order(size);
26277  
26278         rcu_read_lock();
26279 -       vbq = &get_cpu_var(vmap_block_queue);
26280 +       cpu = get_cpu_light();
26281 +       vbq = this_cpu_ptr(&vmap_block_queue);
26282         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
26283                 unsigned long pages_off;
26284  
26285 @@ -1008,7 +1011,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
26286                 break;
26287         }
26288  
26289 -       put_cpu_var(vmap_block_queue);
26290 +       put_cpu_light();
26291         rcu_read_unlock();
26292  
26293         /* Allocate new block if nothing was found */
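In the vmap-block paths above, get_cpu_var(vmap_block_queue), which disables preemption, is replaced by get_cpu_light() plus this_cpu_ptr(). That is enough because the per-CPU vmap_block_queue is only a locality hint: the queue carries its own spinlock (vbq->lock), so correctness never depended on staying non-preemptible, and on RT get_cpu_light() is the variant that prevents migration rather than preemption. A userspace sketch of the same idea, using sched_getcpu() purely as a hint and a per-queue mutex for the actual exclusion; nothing below is kernel API:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

#define NR_QUEUES       8

struct block {
        struct block *next;
};

struct block_queue {
        pthread_mutex_t lock;   /* plays the role of vbq->lock */
        struct block *free_list;
};

static struct block_queue queues[NR_QUEUES] = {
        [0 ... NR_QUEUES - 1] = { .lock = PTHREAD_MUTEX_INITIALIZER },
};

/* Pick "our" queue only as a cache-locality hint. Being moved to another
 * CPU right after sched_getcpu() is harmless, because the queue's own
 * lock provides the mutual exclusion; that is exactly why get_cpu_light()
 * is sufficient in the hunks above. */
void queue_add(struct block *b)
{
        int cpu = sched_getcpu();
        struct block_queue *q;

        if (cpu < 0)
                cpu = 0;
        q = &queues[cpu % NR_QUEUES];

        pthread_mutex_lock(&q->lock);
        b->next = q->free_list;
        q->free_list = b;
        pthread_mutex_unlock(&q->lock);
}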
26294 diff --git a/mm/vmstat.c b/mm/vmstat.c
26295 index 604f26a4f696..312006d2db50 100644
26296 --- a/mm/vmstat.c
26297 +++ b/mm/vmstat.c
26298 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
26299         long x;
26300         long t;
26301  
26302 +       preempt_disable_rt();
26303         x = delta + __this_cpu_read(*p);
26304  
26305         t = __this_cpu_read(pcp->stat_threshold);
26306 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
26307                 x = 0;
26308         }
26309         __this_cpu_write(*p, x);
26310 +       preempt_enable_rt();
26311  }
26312  EXPORT_SYMBOL(__mod_zone_page_state);
26313  
26314 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
26315         long x;
26316         long t;
26317  
26318 +       preempt_disable_rt();
26319         x = delta + __this_cpu_read(*p);
26320  
26321         t = __this_cpu_read(pcp->stat_threshold);
26322 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
26323                 x = 0;
26324         }
26325         __this_cpu_write(*p, x);
26326 +       preempt_enable_rt();
26327  }
26328  EXPORT_SYMBOL(__mod_node_page_state);
26329  
26330 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
26331         s8 __percpu *p = pcp->vm_stat_diff + item;
26332         s8 v, t;
26333  
26334 +       preempt_disable_rt();
26335         v = __this_cpu_inc_return(*p);
26336         t = __this_cpu_read(pcp->stat_threshold);
26337         if (unlikely(v > t)) {
26338 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
26339                 zone_page_state_add(v + overstep, zone, item);
26340                 __this_cpu_write(*p, -overstep);
26341         }
26342 +       preempt_enable_rt();
26343  }
26344  
26345  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26346 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26347         s8 __percpu *p = pcp->vm_node_stat_diff + item;
26348         s8 v, t;
26349  
26350 +       preempt_disable_rt();
26351         v = __this_cpu_inc_return(*p);
26352         t = __this_cpu_read(pcp->stat_threshold);
26353         if (unlikely(v > t)) {
26354 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26355                 node_page_state_add(v + overstep, pgdat, item);
26356                 __this_cpu_write(*p, -overstep);
26357         }
26358 +       preempt_enable_rt();
26359  }
26360  
26361  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
26362 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
26363         s8 __percpu *p = pcp->vm_stat_diff + item;
26364         s8 v, t;
26365  
26366 +       preempt_disable_rt();
26367         v = __this_cpu_dec_return(*p);
26368         t = __this_cpu_read(pcp->stat_threshold);
26369         if (unlikely(v < - t)) {
26370 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
26371                 zone_page_state_add(v - overstep, zone, item);
26372                 __this_cpu_write(*p, overstep);
26373         }
26374 +       preempt_enable_rt();
26375  }
26376  
26377  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26378 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26379         s8 __percpu *p = pcp->vm_node_stat_diff + item;
26380         s8 v, t;
26381  
26382 +       preempt_disable_rt();
26383         v = __this_cpu_dec_return(*p);
26384         t = __this_cpu_read(pcp->stat_threshold);
26385         if (unlikely(v < - t)) {
26386 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
26387                 node_page_state_add(v - overstep, pgdat, item);
26388                 __this_cpu_write(*p, overstep);
26389         }
26390 +       preempt_enable_rt();
26391  }
26392  
26393  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
26394 diff --git a/mm/workingset.c b/mm/workingset.c
26395 index 4c4f05655e6e..b97b1e87b54c 100644
26396 --- a/mm/workingset.c
26397 +++ b/mm/workingset.c
26398 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
26399   * point where they would still be useful.
26400   */
26401  
26402 -struct list_lru workingset_shadow_nodes;
26403 +struct list_lru __workingset_shadow_nodes;
26404 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
26405  
26406  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
26407                                         struct shrink_control *sc)
26408 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
26409         unsigned long pages;
26410  
26411         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
26412 -       local_irq_disable();
26413 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
26414 -       local_irq_enable();
26415 +       local_lock_irq(workingset_shadow_lock);
26416 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
26417 +       local_unlock_irq(workingset_shadow_lock);
26418  
26419         if (sc->memcg) {
26420                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
26421 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
26422         spin_unlock(&mapping->tree_lock);
26423         ret = LRU_REMOVED_RETRY;
26424  out:
26425 -       local_irq_enable();
26426 +       local_unlock_irq(workingset_shadow_lock);
26427         cond_resched();
26428 -       local_irq_disable();
26429 +       local_lock_irq(workingset_shadow_lock);
26430         spin_lock(lru_lock);
26431         return ret;
26432  }
26433 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
26434         unsigned long ret;
26435  
26436         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
26437 -       local_irq_disable();
26438 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
26439 +       local_lock_irq(workingset_shadow_lock);
26440 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
26441                                     shadow_lru_isolate, NULL);
26442 -       local_irq_enable();
26443 +       local_unlock_irq(workingset_shadow_lock);
26444         return ret;
26445  }
26446  
26447 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
26448         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
26449                timestamp_bits, max_order, bucket_order);
26450  
26451 -       ret = __list_lru_init(&workingset_shadow_nodes, true, &shadow_nodes_key);
26452 +       ret = __list_lru_init(&__workingset_shadow_nodes, true, &shadow_nodes_key);
26453         if (ret)
26454                 goto err;
26455         ret = register_shrinker(&workingset_shadow_shrinker);
26456 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
26457                 goto err_list_lru;
26458         return 0;
26459  err_list_lru:
26460 -       list_lru_destroy(&workingset_shadow_nodes);
26461 +       list_lru_destroy(&__workingset_shadow_nodes);
26462  err:
26463         return ret;
26464  }
26465 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
26466 index 1689bb58e0d1..e52a8cb6aa5a 100644
26467 --- a/mm/zsmalloc.c
26468 +++ b/mm/zsmalloc.c
26469 @@ -53,6 +53,7 @@
26470  #include <linux/mount.h>
26471  #include <linux/migrate.h>
26472  #include <linux/pagemap.h>
26473 +#include <linux/locallock.h>
26474  
26475  #define ZSPAGE_MAGIC   0x58
26476  
26477 @@ -70,9 +71,22 @@
26478   */
26479  #define ZS_MAX_ZSPAGE_ORDER 2
26480  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
26481 -
26482  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
26483  
26484 +#ifdef CONFIG_PREEMPT_RT_FULL
26485 +
26486 +struct zsmalloc_handle {
26487 +       unsigned long addr;
26488 +       struct mutex lock;
26489 +};
26490 +
26491 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
26492 +
26493 +#else
26494 +
26495 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
26496 +#endif
26497 +
26498  /*
26499   * Object location (<PFN>, <obj_idx>) is encoded as
26500   * as single (unsigned long) handle value.
26501 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
26502  
26503  static int create_cache(struct zs_pool *pool)
26504  {
26505 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
26506 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
26507                                         0, 0, NULL);
26508         if (!pool->handle_cachep)
26509                 return 1;
26510 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
26511  
26512  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
26513  {
26514 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
26515 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
26516 +       void *p;
26517 +
26518 +       p = kmem_cache_alloc(pool->handle_cachep,
26519 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
26520 +#ifdef CONFIG_PREEMPT_RT_FULL
26521 +       if (p) {
26522 +               struct zsmalloc_handle *zh = p;
26523 +
26524 +               mutex_init(&zh->lock);
26525 +       }
26526 +#endif
26527 +       return (unsigned long)p;
26528  }
26529  
26530 +#ifdef CONFIG_PREEMPT_RT_FULL
26531 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
26532 +{
26533 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
26534 +}
26535 +#endif
26536 +
26537  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
26538  {
26539         kmem_cache_free(pool->handle_cachep, (void *)handle);
26540 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
26541  
26542  static void record_obj(unsigned long handle, unsigned long obj)
26543  {
26544 +#ifdef CONFIG_PREEMPT_RT_FULL
26545 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26546 +
26547 +       WRITE_ONCE(zh->addr, obj);
26548 +#else
26549         /*
26550          * lsb of @obj represents handle lock while other bits
26551          * represent object value the handle is pointing so
26552          * updating shouldn't do store tearing.
26553          */
26554         WRITE_ONCE(*(unsigned long *)handle, obj);
26555 +#endif
26556  }
26557  
26558  /* zpool driver */
26559 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
26560  
26561  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
26562  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
26563 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
26564  
26565  static bool is_zspage_isolated(struct zspage *zspage)
26566  {
26567 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
26568  
26569  static unsigned long handle_to_obj(unsigned long handle)
26570  {
26571 +#ifdef CONFIG_PREEMPT_RT_FULL
26572 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26573 +
26574 +       return zh->addr;
26575 +#else
26576         return *(unsigned long *)handle;
26577 +#endif
26578  }
26579  
26580  static unsigned long obj_to_head(struct page *page, void *obj)
26581 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
26582  
26583  static inline int testpin_tag(unsigned long handle)
26584  {
26585 +#ifdef CONFIG_PREEMPT_RT_FULL
26586 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26587 +
26588 +       return mutex_is_locked(&zh->lock);
26589 +#else
26590         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
26591 +#endif
26592  }
26593  
26594  static inline int trypin_tag(unsigned long handle)
26595  {
26596 +#ifdef CONFIG_PREEMPT_RT_FULL
26597 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26598 +
26599 +       return mutex_trylock(&zh->lock);
26600 +#else
26601         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
26602 +#endif
26603  }
26604  
26605  static void pin_tag(unsigned long handle)
26606  {
26607 +#ifdef CONFIG_PREEMPT_RT_FULL
26608 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26609 +
26610 +       return mutex_lock(&zh->lock);
26611 +#else
26612         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
26613 +#endif
26614  }
26615  
26616  static void unpin_tag(unsigned long handle)
26617  {
26618 +#ifdef CONFIG_PREEMPT_RT_FULL
26619 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26620 +
26621 +       return mutex_unlock(&zh->lock);
26622 +#else
26623         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
26624 +#endif
26625  }
26626  
26627  static void reset_page(struct page *page)
26628 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
26629         class = pool->size_class[class_idx];
26630         off = (class->size * obj_idx) & ~PAGE_MASK;
26631  
26632 -       area = &get_cpu_var(zs_map_area);
26633 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
26634         area->vm_mm = mm;
26635         if (off + class->size <= PAGE_SIZE) {
26636                 /* this object is contained entirely within a page */
26637 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
26638  
26639                 __zs_unmap_object(area, pages, off, class->size);
26640         }
26641 -       put_cpu_var(zs_map_area);
26642 +       put_locked_var(zs_map_area_lock, zs_map_area);
26643  
26644         migrate_read_unlock(zspage);
26645         unpin_tag(handle);
26646 diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
26647 index c88a6007e643..5de85b55a821 100644
26648 --- a/net/bluetooth/hci_sock.c
26649 +++ b/net/bluetooth/hci_sock.c
26650 @@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
26651  }
26652  
26653  /* Send frame to sockets with specific channel */
26654 -void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26655 -                        int flag, struct sock *skip_sk)
26656 +static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26657 +                                 int flag, struct sock *skip_sk)
26658  {
26659         struct sock *sk;
26660  
26661         BT_DBG("channel %u len %d", channel, skb->len);
26662  
26663 -       read_lock(&hci_sk_list.lock);
26664 -
26665         sk_for_each(sk, &hci_sk_list.head) {
26666                 struct sk_buff *nskb;
26667  
26668 @@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26669                         kfree_skb(nskb);
26670         }
26671  
26672 +}
26673 +
26674 +void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
26675 +                        int flag, struct sock *skip_sk)
26676 +{
26677 +       read_lock(&hci_sk_list.lock);
26678 +       __hci_send_to_channel(channel, skb, flag, skip_sk);
26679         read_unlock(&hci_sk_list.lock);
26680  }
26681  
26682 @@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
26683                 hdr->index = index;
26684                 hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
26685  
26686 -               hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
26687 -                                   HCI_SOCK_TRUSTED, NULL);
26688 +               __hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
26689 +                                     HCI_SOCK_TRUSTED, NULL);
26690                 kfree_skb(skb);
26691         }
26692  
26693 diff --git a/net/core/dev.c b/net/core/dev.c
26694 index c37891828e4e..3235360cd9a4 100644
26695 --- a/net/core/dev.c
26696 +++ b/net/core/dev.c
26697 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
26698  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
26699  
26700  static seqcount_t devnet_rename_seq;
26701 +static DEFINE_MUTEX(devnet_rename_mutex);
26702  
26703  static inline void dev_base_seq_inc(struct net *net)
26704  {
26705 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
26706  static inline void rps_lock(struct softnet_data *sd)
26707  {
26708  #ifdef CONFIG_RPS
26709 -       spin_lock(&sd->input_pkt_queue.lock);
26710 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
26711  #endif
26712  }
26713  
26714  static inline void rps_unlock(struct softnet_data *sd)
26715  {
26716  #ifdef CONFIG_RPS
26717 -       spin_unlock(&sd->input_pkt_queue.lock);
26718 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
26719  #endif
26720  }
26721  
26722 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
26723         strcpy(name, dev->name);
26724         rcu_read_unlock();
26725         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
26726 -               cond_resched();
26727 +               mutex_lock(&devnet_rename_mutex);
26728 +               mutex_unlock(&devnet_rename_mutex);
26729                 goto retry;
26730         }
26731  
26732 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
26733         if (dev->flags & IFF_UP)
26734                 return -EBUSY;
26735  
26736 -       write_seqcount_begin(&devnet_rename_seq);
26737 +       mutex_lock(&devnet_rename_mutex);
26738 +       __raw_write_seqcount_begin(&devnet_rename_seq);
26739  
26740 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
26741 -               write_seqcount_end(&devnet_rename_seq);
26742 -               return 0;
26743 -       }
26744 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
26745 +               goto outunlock;
26746  
26747         memcpy(oldname, dev->name, IFNAMSIZ);
26748  
26749         err = dev_get_valid_name(net, dev, newname);
26750 -       if (err < 0) {
26751 -               write_seqcount_end(&devnet_rename_seq);
26752 -               return err;
26753 -       }
26754 +       if (err < 0)
26755 +               goto outunlock;
26756  
26757         if (oldname[0] && !strchr(oldname, '%'))
26758                 netdev_info(dev, "renamed from %s\n", oldname);
26759 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
26760         if (ret) {
26761                 memcpy(dev->name, oldname, IFNAMSIZ);
26762                 dev->name_assign_type = old_assign_type;
26763 -               write_seqcount_end(&devnet_rename_seq);
26764 -               return ret;
26765 +               err = ret;
26766 +               goto outunlock;
26767         }
26768  
26769 -       write_seqcount_end(&devnet_rename_seq);
26770 +       __raw_write_seqcount_end(&devnet_rename_seq);
26771 +       mutex_unlock(&devnet_rename_mutex);
26772  
26773         netdev_adjacent_rename_links(dev, oldname);
26774  
26775 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
26776                 /* err >= 0 after dev_alloc_name() or stores the first errno */
26777                 if (err >= 0) {
26778                         err = ret;
26779 -                       write_seqcount_begin(&devnet_rename_seq);
26780 +                       mutex_lock(&devnet_rename_mutex);
26781 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
26782                         memcpy(dev->name, oldname, IFNAMSIZ);
26783                         memcpy(oldname, newname, IFNAMSIZ);
26784                         dev->name_assign_type = old_assign_type;
26785 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
26786         }
26787  
26788         return err;
26789 +
26790 +outunlock:
26791 +       __raw_write_seqcount_end(&devnet_rename_seq);
26792 +       mutex_unlock(&devnet_rename_mutex);
26793 +       return err;
26794  }
26795  
26796  /**
26797 @@ -2286,6 +2292,7 @@ static void __netif_reschedule(struct Qdisc *q)
26798         sd->output_queue_tailp = &q->next_sched;
26799         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26800         local_irq_restore(flags);
26801 +       preempt_check_resched_rt();
26802  }
26803  
26804  void __netif_schedule(struct Qdisc *q)
26805 @@ -2370,6 +2377,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
26806         __this_cpu_write(softnet_data.completion_queue, skb);
26807         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26808         local_irq_restore(flags);
26809 +       preempt_check_resched_rt();
26810  }
26811  EXPORT_SYMBOL(__dev_kfree_skb_irq);
26812  
26813 @@ -3111,7 +3119,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
26814          * This permits qdisc->running owner to get the lock more
26815          * often and dequeue packets faster.
26816          */
26817 +#ifdef CONFIG_PREEMPT_RT_FULL
26818 +       contended = true;
26819 +#else
26820         contended = qdisc_is_running(q);
26821 +#endif
26822         if (unlikely(contended))
26823                 spin_lock(&q->busylock);
26824  
26825 @@ -3174,8 +3186,10 @@ static void skb_update_prio(struct sk_buff *skb)
26826  #define skb_update_prio(skb)
26827  #endif
26828  
26829 +#ifndef CONFIG_PREEMPT_RT_FULL
26830  DEFINE_PER_CPU(int, xmit_recursion);
26831  EXPORT_SYMBOL(xmit_recursion);
26832 +#endif
26833  
26834  /**
26835   *     dev_loopback_xmit - loop back @skb
26836 @@ -3409,8 +3423,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
26837                 int cpu = smp_processor_id(); /* ok because BHs are off */
26838  
26839                 if (txq->xmit_lock_owner != cpu) {
26840 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
26841 -                                    XMIT_RECURSION_LIMIT))
26842 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
26843                                 goto recursion_alert;
26844  
26845                         skb = validate_xmit_skb(skb, dev);
26846 @@ -3420,9 +3433,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
26847                         HARD_TX_LOCK(dev, txq, cpu);
26848  
26849                         if (!netif_xmit_stopped(txq)) {
26850 -                               __this_cpu_inc(xmit_recursion);
26851 +                               xmit_rec_inc();
26852                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
26853 -                               __this_cpu_dec(xmit_recursion);
26854 +                               xmit_rec_dec();
26855                                 if (dev_xmit_complete(rc)) {
26856                                         HARD_TX_UNLOCK(dev, txq);
26857                                         goto out;
26858 @@ -3796,6 +3809,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
26859         rps_unlock(sd);
26860  
26861         local_irq_restore(flags);
26862 +       preempt_check_resched_rt();
26863  
26864         atomic_long_inc(&skb->dev->rx_dropped);
26865         kfree_skb(skb);
26866 @@ -3814,7 +3828,7 @@ static int netif_rx_internal(struct sk_buff *skb)
26867                 struct rps_dev_flow voidflow, *rflow = &voidflow;
26868                 int cpu;
26869  
26870 -               preempt_disable();
26871 +               migrate_disable();
26872                 rcu_read_lock();
26873  
26874                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
26875 @@ -3824,13 +3838,13 @@ static int netif_rx_internal(struct sk_buff *skb)
26876                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
26877  
26878                 rcu_read_unlock();
26879 -               preempt_enable();
26880 +               migrate_enable();
26881         } else
26882  #endif
26883         {
26884                 unsigned int qtail;
26885 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
26886 -               put_cpu();
26887 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
26888 +               put_cpu_light();
26889         }
26890         return ret;
26891  }
26892 @@ -3864,11 +3878,9 @@ int netif_rx_ni(struct sk_buff *skb)
26893  
26894         trace_netif_rx_ni_entry(skb);
26895  
26896 -       preempt_disable();
26897 +       local_bh_disable();
26898         err = netif_rx_internal(skb);
26899 -       if (local_softirq_pending())
26900 -               do_softirq();
26901 -       preempt_enable();
26902 +       local_bh_enable();
26903  
26904         return err;
26905  }
26906 @@ -4347,7 +4359,7 @@ static void flush_backlog(struct work_struct *work)
26907         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
26908                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
26909                         __skb_unlink(skb, &sd->input_pkt_queue);
26910 -                       kfree_skb(skb);
26911 +                       __skb_queue_tail(&sd->tofree_queue, skb);
26912                         input_queue_head_incr(sd);
26913                 }
26914         }
26915 @@ -4357,11 +4369,14 @@ static void flush_backlog(struct work_struct *work)
26916         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
26917                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
26918                         __skb_unlink(skb, &sd->process_queue);
26919 -                       kfree_skb(skb);
26920 +                       __skb_queue_tail(&sd->tofree_queue, skb);
26921                         input_queue_head_incr(sd);
26922                 }
26923         }
26924 +       if (!skb_queue_empty(&sd->tofree_queue))
26925 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
26926         local_bh_enable();
26927 +
26928  }
26929  
26930  static void flush_all_backlogs(void)
26931 @@ -4852,6 +4867,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
26932                 sd->rps_ipi_list = NULL;
26933  
26934                 local_irq_enable();
26935 +               preempt_check_resched_rt();
26936  
26937                 /* Send pending IPI's to kick RPS processing on remote cpus. */
26938                 while (remsd) {
26939 @@ -4865,6 +4881,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
26940         } else
26941  #endif
26942                 local_irq_enable();
26943 +       preempt_check_resched_rt();
26944  }
26945  
26946  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
26947 @@ -4894,7 +4911,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
26948         while (again) {
26949                 struct sk_buff *skb;
26950  
26951 +               local_irq_disable();
26952                 while ((skb = __skb_dequeue(&sd->process_queue))) {
26953 +                       local_irq_enable();
26954                         rcu_read_lock();
26955                         __netif_receive_skb(skb);
26956                         rcu_read_unlock();
26957 @@ -4902,9 +4921,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
26958                         if (++work >= quota)
26959                                 return work;
26960  
26961 +                       local_irq_disable();
26962                 }
26963  
26964 -               local_irq_disable();
26965                 rps_lock(sd);
26966                 if (skb_queue_empty(&sd->input_pkt_queue)) {
26967                         /*
26968 @@ -4942,9 +4961,11 @@ void __napi_schedule(struct napi_struct *n)
26969         local_irq_save(flags);
26970         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
26971         local_irq_restore(flags);
26972 +       preempt_check_resched_rt();
26973  }
26974  EXPORT_SYMBOL(__napi_schedule);
26975  
26976 +#ifndef CONFIG_PREEMPT_RT_FULL
26977  /**
26978   * __napi_schedule_irqoff - schedule for receive
26979   * @n: entry to schedule
26980 @@ -4956,6 +4977,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
26981         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
26982  }
26983  EXPORT_SYMBOL(__napi_schedule_irqoff);
26984 +#endif
26985  
26986  void __napi_complete(struct napi_struct *n)
26987  {
26988 @@ -5245,13 +5267,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
26989         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
26990         unsigned long time_limit = jiffies + 2;
26991         int budget = netdev_budget;
26992 +       struct sk_buff_head tofree_q;
26993 +       struct sk_buff *skb;
26994         LIST_HEAD(list);
26995         LIST_HEAD(repoll);
26996  
26997 +       __skb_queue_head_init(&tofree_q);
26998 +
26999         local_irq_disable();
27000 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
27001         list_splice_init(&sd->poll_list, &list);
27002         local_irq_enable();
27003  
27004 +       while ((skb = __skb_dequeue(&tofree_q)))
27005 +               kfree_skb(skb);
27006 +
27007         for (;;) {
27008                 struct napi_struct *n;
27009  
27010 @@ -5282,7 +5312,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
27011         list_splice_tail(&repoll, &list);
27012         list_splice(&list, &sd->poll_list);
27013         if (!list_empty(&sd->poll_list))
27014 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
27015 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
27016  
27017         net_rps_action_and_irq_enable(sd);
27018  }
27019 @@ -8044,16 +8074,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
27020  
27021         raise_softirq_irqoff(NET_TX_SOFTIRQ);
27022         local_irq_enable();
27023 +       preempt_check_resched_rt();
27024  
27025         /* Process offline CPU's input_pkt_queue */
27026         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
27027                 netif_rx_ni(skb);
27028                 input_queue_head_incr(oldsd);
27029         }
27030 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
27031 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
27032                 netif_rx_ni(skb);
27033                 input_queue_head_incr(oldsd);
27034         }
27035 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
27036 +               kfree_skb(skb);
27037 +       }
27038  
27039         return NOTIFY_OK;
27040  }
27041 @@ -8358,8 +8392,9 @@ static int __init net_dev_init(void)
27042  
27043                 INIT_WORK(flush, flush_backlog);
27044  
27045 -               skb_queue_head_init(&sd->input_pkt_queue);
27046 -               skb_queue_head_init(&sd->process_queue);
27047 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
27048 +               skb_queue_head_init_raw(&sd->process_queue);
27049 +               skb_queue_head_init_raw(&sd->tofree_queue);
27050                 INIT_LIST_HEAD(&sd->poll_list);
27051                 sd->output_queue_tailp = &sd->output_queue;
27052  #ifdef CONFIG_RPS
27053 diff --git a/net/core/filter.c b/net/core/filter.c
27054 index 4eb4ce0aeef4..4f09d6a57217 100644
27055 --- a/net/core/filter.c
27056 +++ b/net/core/filter.c
27057 @@ -1645,7 +1645,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
27058  {
27059         int ret;
27060  
27061 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
27062 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
27063                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
27064                 kfree_skb(skb);
27065                 return -ENETDOWN;
27066 @@ -1653,9 +1653,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
27067  
27068         skb->dev = dev;
27069  
27070 -       __this_cpu_inc(xmit_recursion);
27071 +       xmit_rec_inc();
27072         ret = dev_queue_xmit(skb);
27073 -       __this_cpu_dec(xmit_recursion);
27074 +       xmit_rec_dec();
27075  
27076         return ret;
27077  }
27078 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
27079 index cad8e791f28e..2a9364fe62a5 100644
27080 --- a/net/core/gen_estimator.c
27081 +++ b/net/core/gen_estimator.c
27082 @@ -84,7 +84,7 @@ struct gen_estimator
27083         struct gnet_stats_basic_packed  *bstats;
27084         struct gnet_stats_rate_est64    *rate_est;
27085         spinlock_t              *stats_lock;
27086 -       seqcount_t              *running;
27087 +       net_seqlock_t           *running;
27088         int                     ewma_log;
27089         u32                     last_packets;
27090         unsigned long           avpps;
27091 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
27092                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
27093                       struct gnet_stats_rate_est64 *rate_est,
27094                       spinlock_t *stats_lock,
27095 -                     seqcount_t *running,
27096 +                     net_seqlock_t *running,
27097                       struct nlattr *opt)
27098  {
27099         struct gen_estimator *est;
27100 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
27101                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
27102                           struct gnet_stats_rate_est64 *rate_est,
27103                           spinlock_t *stats_lock,
27104 -                         seqcount_t *running, struct nlattr *opt)
27105 +                         net_seqlock_t *running, struct nlattr *opt)
27106  {
27107         gen_kill_estimator(bstats, rate_est);
27108         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
27109 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
27110 index 508e051304fb..bc3b17b78c94 100644
27111 --- a/net/core/gen_stats.c
27112 +++ b/net/core/gen_stats.c
27113 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
27114  }
27115  
27116  void
27117 -__gnet_stats_copy_basic(const seqcount_t *running,
27118 +__gnet_stats_copy_basic(net_seqlock_t *running,
27119                         struct gnet_stats_basic_packed *bstats,
27120                         struct gnet_stats_basic_cpu __percpu *cpu,
27121                         struct gnet_stats_basic_packed *b)
27122 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
27123         }
27124         do {
27125                 if (running)
27126 -                       seq = read_seqcount_begin(running);
27127 +                       seq = net_seq_begin(running);
27128                 bstats->bytes = b->bytes;
27129                 bstats->packets = b->packets;
27130 -       } while (running && read_seqcount_retry(running, seq));
27131 +       } while (running && net_seq_retry(running, seq));
27132  }
27133  EXPORT_SYMBOL(__gnet_stats_copy_basic);
27134  
27135 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
27136   * if the room in the socket buffer was not sufficient.
27137   */
27138  int
27139 -gnet_stats_copy_basic(const seqcount_t *running,
27140 +gnet_stats_copy_basic(net_seqlock_t *running,
27141                       struct gnet_dump *d,
27142                       struct gnet_stats_basic_cpu __percpu *cpu,
27143                       struct gnet_stats_basic_packed *b)
27144 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
27145 index aec5605944d3..d4cb8bfdb83c 100644
27146 --- a/net/core/skbuff.c
27147 +++ b/net/core/skbuff.c
27148 @@ -64,6 +64,7 @@
27149  #include <linux/errqueue.h>
27150  #include <linux/prefetch.h>
27151  #include <linux/if_vlan.h>
27152 +#include <linux/locallock.h>
27153  
27154  #include <net/protocol.h>
27155  #include <net/dst.h>
27156 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
27157  
27158  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
27159  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
27160 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
27161 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
27162  
27163  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
27164  {
27165 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
27166         unsigned long flags;
27167         void *data;
27168  
27169 -       local_irq_save(flags);
27170 +       local_lock_irqsave(netdev_alloc_lock, flags);
27171         nc = this_cpu_ptr(&netdev_alloc_cache);
27172         data = __alloc_page_frag(nc, fragsz, gfp_mask);
27173 -       local_irq_restore(flags);
27174 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
27175         return data;
27176  }
27177  
27178 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
27179  
27180  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
27181  {
27182 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27183 +       struct napi_alloc_cache *nc;
27184 +       void *data;
27185  
27186 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
27187 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27188 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
27189 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27190 +       return data;
27191  }
27192  
27193  void *napi_alloc_frag(unsigned int fragsz)
27194 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
27195         if (sk_memalloc_socks())
27196                 gfp_mask |= __GFP_MEMALLOC;
27197  
27198 -       local_irq_save(flags);
27199 +       local_lock_irqsave(netdev_alloc_lock, flags);
27200  
27201         nc = this_cpu_ptr(&netdev_alloc_cache);
27202         data = __alloc_page_frag(nc, len, gfp_mask);
27203         pfmemalloc = nc->pfmemalloc;
27204  
27205 -       local_irq_restore(flags);
27206 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
27207  
27208         if (unlikely(!data))
27209                 return NULL;
27210 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
27211  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
27212                                  gfp_t gfp_mask)
27213  {
27214 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27215 +       struct napi_alloc_cache *nc;
27216         struct sk_buff *skb;
27217         void *data;
27218 +       bool pfmemalloc;
27219  
27220         len += NET_SKB_PAD + NET_IP_ALIGN;
27221  
27222 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
27223         if (sk_memalloc_socks())
27224                 gfp_mask |= __GFP_MEMALLOC;
27225  
27226 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27227         data = __alloc_page_frag(&nc->page, len, gfp_mask);
27228 +       pfmemalloc = nc->page.pfmemalloc;
27229 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27230         if (unlikely(!data))
27231                 return NULL;
27232  
27233 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
27234         }
27235  
27236         /* use OR instead of assignment to avoid clearing of bits in mask */
27237 -       if (nc->page.pfmemalloc)
27238 +       if (pfmemalloc)
27239                 skb->pfmemalloc = 1;
27240         skb->head_frag = 1;
27241  
27242 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
27243  
27244  void __kfree_skb_flush(void)
27245  {
27246 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27247 +       struct napi_alloc_cache *nc;
27248  
27249 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27250         /* flush skb_cache if containing objects */
27251         if (nc->skb_count) {
27252                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
27253                                      nc->skb_cache);
27254                 nc->skb_count = 0;
27255         }
27256 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27257  }
27258  
27259  static inline void _kfree_skb_defer(struct sk_buff *skb)
27260  {
27261 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
27262 +       struct napi_alloc_cache *nc;
27263  
27264         /* drop skb->head and call any destructors for packet */
27265         skb_release_all(skb);
27266  
27267 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27268         /* record skb to CPU local list */
27269         nc->skb_cache[nc->skb_count++] = skb;
27270  
27271 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
27272                                      nc->skb_cache);
27273                 nc->skb_count = 0;
27274         }
27275 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
27276  }
27277  void __kfree_skb_defer(struct sk_buff *skb)
27278  {
27279 diff --git a/net/core/sock.c b/net/core/sock.c
27280 index e3b60460dc9c..8d15848c3a22 100644
27281 --- a/net/core/sock.c
27282 +++ b/net/core/sock.c
27283 @@ -2493,12 +2493,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
27284         if (sk->sk_lock.owned)
27285                 __lock_sock(sk);
27286         sk->sk_lock.owned = 1;
27287 -       spin_unlock(&sk->sk_lock.slock);
27288 +       spin_unlock_bh(&sk->sk_lock.slock);
27289         /*
27290          * The sk_lock has mutex_lock() semantics here:
27291          */
27292         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
27293 -       local_bh_enable();
27294  }
27295  EXPORT_SYMBOL(lock_sock_nested);
27296  
27297 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
27298 index 48734ee6293f..330224ef4174 100644
27299 --- a/net/ipv4/icmp.c
27300 +++ b/net/ipv4/icmp.c
27301 @@ -69,6 +69,7 @@
27302  #include <linux/jiffies.h>
27303  #include <linux/kernel.h>
27304  #include <linux/fcntl.h>
27305 +#include <linux/sysrq.h>
27306  #include <linux/socket.h>
27307  #include <linux/in.h>
27308  #include <linux/inet.h>
27309 @@ -77,6 +78,7 @@
27310  #include <linux/string.h>
27311  #include <linux/netfilter_ipv4.h>
27312  #include <linux/slab.h>
27313 +#include <linux/locallock.h>
27314  #include <net/snmp.h>
27315  #include <net/ip.h>
27316  #include <net/route.h>
27317 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
27318   *
27319   *     On SMP we have one ICMP socket per-cpu.
27320   */
27321 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
27322 +
27323  static struct sock *icmp_sk(struct net *net)
27324  {
27325         return *this_cpu_ptr(net->ipv4.icmp_sk);
27326 @@ -215,12 +219,18 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
27327  
27328         local_bh_disable();
27329  
27330 +       if (!local_trylock(icmp_sk_lock)) {
27331 +               local_bh_enable();
27332 +               return NULL;
27333 +       }
27334 +
27335         sk = icmp_sk(net);
27336  
27337         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
27338                 /* This can happen if the output path signals a
27339                  * dst_link_failure() for an outgoing ICMP packet.
27340                  */
27341 +               local_unlock(icmp_sk_lock);
27342                 local_bh_enable();
27343                 return NULL;
27344         }
27345 @@ -230,6 +240,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
27346  static inline void icmp_xmit_unlock(struct sock *sk)
27347  {
27348         spin_unlock_bh(&sk->sk_lock.slock);
27349 +       local_unlock(icmp_sk_lock);
27350  }
27351  
27352  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
27353 @@ -358,6 +369,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
27354         struct sock *sk;
27355         struct sk_buff *skb;
27356  
27357 +       local_lock(icmp_sk_lock);
27358         sk = icmp_sk(dev_net((*rt)->dst.dev));
27359         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
27360                            icmp_param->data_len+icmp_param->head_len,
27361 @@ -380,6 +392,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
27362                 skb->ip_summed = CHECKSUM_NONE;
27363                 ip_push_pending_frames(sk, fl4);
27364         }
27365 +       local_unlock(icmp_sk_lock);
27366  }
27367  
27368  /*
27369 @@ -891,6 +904,30 @@ static bool icmp_redirect(struct sk_buff *skb)
27370  }
27371  
27372  /*
27373 + * 32bit and 64bit have different timestamp length, so we check for
27374 + * the cookie at offset 20 and verify it is repeated at offset 50
27375 + */
27376 +#define CO_POS0                20
27377 +#define CO_POS1                50
27378 +#define CO_SIZE                sizeof(int)
27379 +#define ICMP_SYSRQ_SIZE        57
27380 +
27381 +/*
26382 + * We got an ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
27383 + * pattern and if it matches send the next byte as a trigger to sysrq.
27384 + */
27385 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
27386 +{
27387 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
27388 +       char *p = skb->data;
27389 +
27390 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
27391 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
27392 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
27393 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
27394 +}
27395 +
27396 +/*
27397   *     Handle ICMP_ECHO ("ping") requests.
27398   *
27399   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
27400 @@ -917,6 +954,11 @@ static bool icmp_echo(struct sk_buff *skb)
27401                 icmp_param.data_len        = skb->len;
27402                 icmp_param.head_len        = sizeof(struct icmphdr);
27403                 icmp_reply(&icmp_param, skb);
27404 +
27405 +               if (skb->len == ICMP_SYSRQ_SIZE &&
27406 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
27407 +                       icmp_check_sysrq(net, skb);
27408 +               }
27409         }
27410         /* should there be an ICMP stat for ignored echos? */
27411         return true;
27412 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
27413 index 566cfc50f7cf..4b8551d78a3b 100644
27414 --- a/net/ipv4/sysctl_net_ipv4.c
27415 +++ b/net/ipv4/sysctl_net_ipv4.c
27416 @@ -681,6 +681,13 @@ static struct ctl_table ipv4_net_table[] = {
27417                 .proc_handler   = proc_dointvec
27418         },
27419         {
27420 +               .procname       = "icmp_echo_sysrq",
27421 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
27422 +               .maxlen         = sizeof(int),
27423 +               .mode           = 0644,
27424 +               .proc_handler   = proc_dointvec
27425 +       },
27426 +       {
27427                 .procname       = "icmp_ignore_bogus_error_responses",
27428                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
27429                 .maxlen         = sizeof(int),
27430 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
27431 index d577ec07a0d8..3b7298459c87 100644
27432 --- a/net/ipv4/tcp_ipv4.c
27433 +++ b/net/ipv4/tcp_ipv4.c
27434 @@ -62,6 +62,7 @@
27435  #include <linux/init.h>
27436  #include <linux/times.h>
27437  #include <linux/slab.h>
27438 +#include <linux/locallock.h>
27439  
27440  #include <net/net_namespace.h>
27441  #include <net/icmp.h>
27442 @@ -568,6 +569,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
27443  }
27444  EXPORT_SYMBOL(tcp_v4_send_check);
27445  
27446 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
27447  /*
27448   *     This routine will send an RST to the other tcp.
27449   *
27450 @@ -695,7 +697,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
27451                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
27452  
27453         arg.tos = ip_hdr(skb)->tos;
27454 +
27455         local_bh_disable();
27456 +       local_lock(tcp_sk_lock);
27457         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
27458                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
27459                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
27460 @@ -703,6 +707,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
27461  
27462         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
27463         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
27464 +       local_unlock(tcp_sk_lock);
27465         local_bh_enable();
27466  
27467  #ifdef CONFIG_TCP_MD5SIG
27468 @@ -780,12 +785,14 @@ static void tcp_v4_send_ack(struct net *net,
27469                 arg.bound_dev_if = oif;
27470         arg.tos = tos;
27471         local_bh_disable();
27472 +       local_lock(tcp_sk_lock);
27473         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
27474                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
27475                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
27476                               &arg, arg.iov[0].iov_len);
27477  
27478         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
27479 +       local_unlock(tcp_sk_lock);
27480         local_bh_enable();
27481  }
27482  
27483 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
27484 index 439e597fd374..ca0daeaff370 100644
27485 --- a/net/mac80211/rx.c
27486 +++ b/net/mac80211/rx.c
27487 @@ -4229,7 +4229,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
27488         struct ieee80211_supported_band *sband;
27489         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
27490  
27491 -       WARN_ON_ONCE(softirq_count() == 0);
27492 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
27493  
27494         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
27495                 goto drop;
27496 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
27497 index 004af030ef1a..b64f751bda45 100644
27498 --- a/net/netfilter/core.c
27499 +++ b/net/netfilter/core.c
27500 @@ -22,12 +22,18 @@
27501  #include <linux/proc_fs.h>
27502  #include <linux/mutex.h>
27503  #include <linux/slab.h>
27504 +#include <linux/locallock.h>
27505  #include <linux/rcupdate.h>
27506  #include <net/net_namespace.h>
27507  #include <net/sock.h>
27508  
27509  #include "nf_internals.h"
27510  
27511 +#ifdef CONFIG_PREEMPT_RT_BASE
27512 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
27513 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
27514 +#endif
27515 +
27516  static DEFINE_MUTEX(afinfo_mutex);
27517  
27518  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
27519 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
27520 index e7f6657269e0..f635b773d28f 100644
27521 --- a/net/packet/af_packet.c
27522 +++ b/net/packet/af_packet.c
27523 @@ -63,6 +63,7 @@
27524  #include <linux/if_packet.h>
27525  #include <linux/wireless.h>
27526  #include <linux/kernel.h>
27527 +#include <linux/delay.h>
27528  #include <linux/kmod.h>
27529  #include <linux/slab.h>
27530  #include <linux/vmalloc.h>
27531 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
27532         if (BLOCK_NUM_PKTS(pbd)) {
27533                 while (atomic_read(&pkc->blk_fill_in_prog)) {
27534                         /* Waiting for skb_copy_bits to finish... */
27535 -                       cpu_relax();
27536 +                       cpu_chill();
27537                 }
27538         }
27539  
27540 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
27541                 if (!(status & TP_STATUS_BLK_TMO)) {
27542                         while (atomic_read(&pkc->blk_fill_in_prog)) {
27543                                 /* Waiting for skb_copy_bits to finish... */
27544 -                               cpu_relax();
27545 +                               cpu_chill();
27546                         }
27547                 }
27548                 prb_close_block(pkc, pbd, po, status);
27549 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
27550 index 977f69886c00..f3e7a36b0396 100644
27551 --- a/net/rds/ib_rdma.c
27552 +++ b/net/rds/ib_rdma.c
27553 @@ -34,6 +34,7 @@
27554  #include <linux/slab.h>
27555  #include <linux/rculist.h>
27556  #include <linux/llist.h>
27557 +#include <linux/delay.h>
27558  
27559  #include "rds_single_path.h"
27560  #include "ib_mr.h"
27561 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
27562         for_each_online_cpu(cpu) {
27563                 flag = &per_cpu(clean_list_grace, cpu);
27564                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
27565 -                       cpu_relax();
27566 +                       cpu_chill();
27567         }
27568  }
27569  
27570 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
27571 index 7d921e56e715..13df56a738e5 100644
27572 --- a/net/rxrpc/security.c
27573 +++ b/net/rxrpc/security.c
27574 @@ -19,9 +19,6 @@
27575  #include <keys/rxrpc-type.h>
27576  #include "ar-internal.h"
27577  
27578 -static LIST_HEAD(rxrpc_security_methods);
27579 -static DECLARE_RWSEM(rxrpc_security_sem);
27580 -
27581  static const struct rxrpc_security *rxrpc_security_types[] = {
27582         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
27583  #ifdef CONFIG_RXKAD
27584 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
27585 index ea13df1be067..76c20745b502 100644
27586 --- a/net/sched/sch_api.c
27587 +++ b/net/sched/sch_api.c
27588 @@ -980,7 +980,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
27589                         rcu_assign_pointer(sch->stab, stab);
27590                 }
27591                 if (tca[TCA_RATE]) {
27592 -                       seqcount_t *running;
27593 +                       net_seqlock_t *running;
27594  
27595                         err = -EOPNOTSUPP;
27596                         if (sch->flags & TCQ_F_MQROOT)
27597 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
27598 index 9016c8baf2aa..d925f0e63679 100644
27599 --- a/net/sched/sch_generic.c
27600 +++ b/net/sched/sch_generic.c
27601 @@ -425,7 +425,11 @@ struct Qdisc noop_qdisc = {
27602         .ops            =       &noop_qdisc_ops,
27603         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
27604         .dev_queue      =       &noop_netdev_queue,
27605 +#ifdef CONFIG_PREEMPT_RT_BASE
27606 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
27607 +#else
27608         .running        =       SEQCNT_ZERO(noop_qdisc.running),
27609 +#endif
27610         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
27611  };
27612  EXPORT_SYMBOL(noop_qdisc);
27613 @@ -624,9 +628,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
27614         lockdep_set_class(&sch->busylock,
27615                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
27616  
27617 +#ifdef CONFIG_PREEMPT_RT_BASE
27618 +       seqlock_init(&sch->running);
27619 +       lockdep_set_class(&sch->running.seqcount,
27620 +                         dev->qdisc_running_key ?: &qdisc_running_key);
27621 +       lockdep_set_class(&sch->running.lock,
27622 +                         dev->qdisc_running_key ?: &qdisc_running_key);
27623 +#else
27624         seqcount_init(&sch->running);
27625         lockdep_set_class(&sch->running,
27626                           dev->qdisc_running_key ?: &qdisc_running_key);
27627 +#endif
27628  
27629         sch->ops = ops;
27630         sch->enqueue = ops->enqueue;
27631 @@ -926,7 +938,7 @@ void dev_deactivate_many(struct list_head *head)
27632         /* Wait for outstanding qdisc_run calls. */
27633         list_for_each_entry(dev, head, close_list)
27634                 while (some_qdisc_is_busy(dev))
27635 -                       yield();
27636 +                       msleep(1);
27637  }
27638  
27639  void dev_deactivate(struct net_device *dev)
27640 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
27641 index 9c9db55a0c1e..e6583b018a72 100644
27642 --- a/net/sunrpc/svc_xprt.c
27643 +++ b/net/sunrpc/svc_xprt.c
27644 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27645                 goto out;
27646         }
27647  
27648 -       cpu = get_cpu();
27649 +       cpu = get_cpu_light();
27650         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
27651  
27652         atomic_long_inc(&pool->sp_stats.packets);
27653 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27654  
27655                 atomic_long_inc(&pool->sp_stats.threads_woken);
27656                 wake_up_process(rqstp->rq_task);
27657 -               put_cpu();
27658 +               put_cpu_light();
27659                 goto out;
27660         }
27661         rcu_read_unlock();
27662 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27663                 goto redo_search;
27664         }
27665         rqstp = NULL;
27666 -       put_cpu();
27667 +       put_cpu_light();
27668  out:
27669         trace_svc_xprt_do_enqueue(xprt, rqstp);
27670  }
27671 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
27672 index 6fdc97ef6023..523e0420d7f0 100755
27673 --- a/scripts/mkcompile_h
27674 +++ b/scripts/mkcompile_h
27675 @@ -4,7 +4,8 @@ TARGET=$1
27676  ARCH=$2
27677  SMP=$3
27678  PREEMPT=$4
27679 -CC=$5
27680 +RT=$5
27681 +CC=$6
27682  
27683  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
27684  
27685 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
27686  CONFIG_FLAGS=""
27687  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
27688  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
27689 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
27690  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
27691  
27692  # Truncate to maximum length
27693 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
27694 index 9d33c1e85c79..3d307bda86f9 100644
27695 --- a/sound/core/pcm_native.c
27696 +++ b/sound/core/pcm_native.c
27697 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
27698  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
27699  {
27700         if (!substream->pcm->nonatomic)
27701 -               local_irq_disable();
27702 +               local_irq_disable_nort();
27703         snd_pcm_stream_lock(substream);
27704  }
27705  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
27706 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
27707  {
27708         snd_pcm_stream_unlock(substream);
27709         if (!substream->pcm->nonatomic)
27710 -               local_irq_enable();
27711 +               local_irq_enable_nort();
27712  }
27713  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
27714  
27715 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
27716  {
27717         unsigned long flags = 0;
27718         if (!substream->pcm->nonatomic)
27719 -               local_irq_save(flags);
27720 +               local_irq_save_nort(flags);
27721         snd_pcm_stream_lock(substream);
27722         return flags;
27723  }
27724 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
27725  {
27726         snd_pcm_stream_unlock(substream);
27727         if (!substream->pcm->nonatomic)
27728 -               local_irq_restore(flags);
27729 +               local_irq_restore_nort(flags);
27730  }
27731  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
27732  