1 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
2 index 3a3b30ac2a75..9e0745cafbd8 100644
3 --- a/Documentation/sysrq.txt
4 +++ b/Documentation/sysrq.txt
5 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
6  On other - If you know of the key combos for other architectures, please
7             let me know so I can add them to this section.
8  
9 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
10 -
11 +On all -  write a character to /proc/sysrq-trigger, e.g.:
12                 echo t > /proc/sysrq-trigger
13  
14 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
15 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
16 +        Send an ICMP echo request with this pattern plus the particular
17 +        SysRq command key. Example:
18 +               # ping -c1 -s57 -p0102030468
19 +        will trigger the SysRq-H (help) command.
20 +
21 +
22  *  What are the 'command' keys?
23  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24  'b'     - Will immediately reboot the system without syncing or unmounting
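
A worked example of the network SysRq interface documented in the hunk above (a sketch only; 192.0.2.10 stands in for the target host, and 0x01020304 is simply the cookie used in the documentation text). The ping payload pattern is the four cookie bytes followed by the hex ASCII code of the SysRq command key, and -s57 is the payload size used in the example so that the pattern is actually transmitted:

        # on the target: arm network SysRq with the example cookie
        echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq

        # from another host: 0x68 is ASCII 'h', so this triggers SysRq-H (help)
        ping -c1 -s57 -p0102030468 192.0.2.10
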
25 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
26 new file mode 100644
27 index 000000000000..6f2aeabf7faa
28 --- /dev/null
29 +++ b/Documentation/trace/histograms.txt
30 @@ -0,0 +1,186 @@
31 +               Using the Linux Kernel Latency Histograms
32 +
33 +
34 +This document gives a short explanation of how to enable, configure and use
35 +latency histograms. Latency histograms are primarily relevant in the
36 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
37 +and are used in the quality management of the Linux real-time
38 +capabilities.
39 +
40 +
41 +* Purpose of latency histograms
42 +
43 +A latency histogram continuously accumulates the frequencies of latency
44 +data. There are two types of histograms:
45 +- potential sources of latencies
46 +- effective latencies
47 +
48 +
49 +* Potential sources of latencies
50 +
51 +Potential sources of latencies are code segments where interrupts,
52 +preemption or both are disabled (aka critical sections). To create
53 +histograms of potential sources of latency, the kernel stores the time
54 +stamp at the start of a critical section, determines the time elapsed
55 +when the end of the section is reached, and increments the frequency
56 +counter of that latency value - irrespective of whether any concurrently
57 +running process is affected by latency or not.
58 +- Configuration items (in the Kernel hacking/Tracers submenu)
59 +  CONFIG_INTERRUPT_OFF_HIST
60 +  CONFIG_PREEMPT_OFF_HIST
61 +
62 +
63 +* Effective latencies
64 +
65 +Effective latencies are those actually occurring during wakeup of a process. To
66 +determine effective latencies, the kernel stores the time stamp when a
67 +process is scheduled to be woken up, and determines the duration of the
68 +wakeup time shortly before control is passed over to this process. Note
69 +that the apparent latency in user space may be somewhat longer, since the
70 +process may be interrupted after control is passed over to it but before
71 +the execution in user space takes place. Simply measuring the interval
72 +between enqueuing and wakeup may also not be appropriate in cases when a
73 +process is scheduled as a result of a timer expiration. The timer may have
74 +missed its deadline, e.g. due to disabled interrupts, but this latency
75 +would not be registered. Therefore, the offsets of missed timers are
76 +recorded in a separate histogram. If both wakeup latency and missed timer
77 +offsets are configured and enabled, a third histogram may be enabled that
78 +records the overall latency as a sum of the timer latency, if any, and the
79 +wakeup latency. This histogram is called "timerandwakeup".
80 +- Configuration items (in the Kernel hacking/Tracers submenu)
81 +  CONFIG_WAKEUP_LATENCY_HIST
82 +  CONFIG_MISSED_TIMER_OFFSETS_HIST
83 +
84 +
85 +* Usage
86 +
87 +The interface to the administration of the latency histograms is located
88 +in the debugfs file system. To mount it, either enter
89 +
90 +mount -t sysfs nodev /sys
91 +mount -t debugfs nodev /sys/kernel/debug
92 +
93 +from shell command line level, or add
94 +
95 +nodev  /sys                    sysfs   defaults        0 0
96 +nodev  /sys/kernel/debug       debugfs defaults        0 0
97 +
98 +to the file /etc/fstab. All latency histogram related files are then
99 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
100 +particular histogram type is enabled by writing non-zero to the related
101 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
102 +Select "preemptirqsoff" for the histograms of potential sources of
103 +latencies and "wakeup" for histograms of effective latencies etc. The
104 +histogram data - one per CPU - are available in the files
105 +
106 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
110 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
111 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
112 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
113 +
114 +The histograms are reset by writing non-zero to the file "reset" in a
115 +particular latency directory. To reset all latency data, use
116 +
117 +#!/bin/sh
118 +
119 +TRACINGDIR=/sys/kernel/debug/tracing
120 +HISTDIR=$TRACINGDIR/latency_hist
121 +
122 +if test -d $HISTDIR
123 +then
124 +  cd $HISTDIR
125 +  for i in `find . | grep /reset$`
126 +  do
127 +    echo 1 >$i
128 +  done
129 +fi
130 +
131 +
132 +* Data format
133 +
134 +Latency data are stored with a resolution of one microsecond. The
135 +maximum latency is 10,240 microseconds. The data are only valid if the
136 +overflow register is empty. Every output line contains the latency in
137 +microseconds in the first column and the number of samples in the second
138 +column. To display only lines with a positive latency count, use, for
139 +example,
140 +
141 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
142 +
143 +#Minimum latency: 0 microseconds.
144 +#Average latency: 0 microseconds.
145 +#Maximum latency: 25 microseconds.
146 +#Total samples: 3104770694
147 +#There are 0 samples greater or equal than 10240 microseconds
148 +#usecs          samples
149 +    0        2984486876
150 +    1          49843506
151 +    2          58219047
152 +    3           5348126
153 +    4           2187960
154 +    5           3388262
155 +    6            959289
156 +    7            208294
157 +    8             40420
158 +    9              4485
159 +   10             14918
160 +   11             18340
161 +   12             25052
162 +   13             19455
163 +   14              5602
164 +   15               969
165 +   16                47
166 +   17                18
167 +   18                14
168 +   19                 1
169 +   20                 3
170 +   21                 2
171 +   22                 5
172 +   23                 2
173 +   25                 1
174 +
175 +
176 +* Wakeup latency of a selected process
177 +
178 +To only collect wakeup latency data of a particular process, write the
179 +PID of the requested process to
180 +
181 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
182 +
183 +PIDs are not considered if this variable is set to 0.
184 +
185 +
186 +* Details of the process with the highest wakeup latency so far
187 +
188 +Selected data of the process that suffered from the highest wakeup
189 +latency that occurred in a particular CPU are available in the file
190 +
191 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
192 +
193 +In addition, other relevant system data at the time when the
194 +latency occurred are given.
195 +
196 +The format of the data is (all in one line):
197 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
198 +<- <PID> <Priority> <Command> <Timestamp>
199 +
200 +The value of <Timeroffset> is only relevant in the combined timer
201 +and wakeup latency recording. In the wakeup recording, it is
202 +always 0, in the missed_timer_offsets recording, it is the same
203 +as <Latency>.
204 +
205 +When retrospectively searching for the origin of a latency and
206 +tracing was not enabled, it may be helpful to know the name and
207 +some basic data of the task that (finally) was switching to the
208 +late real-time task. In addition to the victim's data, the data
209 +of the possible culprit are therefore displayed after the
210 +"<-" symbol.
211 +
212 +Finally, the timestamp of the time when the latency occurred
213 +in <seconds>.<microseconds> after the most recent system boot
214 +is provided.
215 +
216 +These data are also reset when the wakeup histogram is reset.
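
Putting the histogram interface together, a complete round trip might look like the following shell session (a sketch based on the paths documented above; the wakeup histogram, CPU0 and PID 4711 are arbitrary examples, and the per-directory "reset" file is the one the reset script above iterates over):

        mount -t debugfs nodev /sys/kernel/debug
        cd /sys/kernel/debug/tracing/latency_hist

        # record wakeup latencies, restricted to PID 4711
        echo 1 > enable/wakeup
        echo 4711 > wakeup/pid

        # ... run the workload, then inspect CPU0 ...
        grep -v " 0$" wakeup/CPU0
        cat wakeup/max_latency-CPU0

        # clear this histogram and the PID filter before the next measurement
        echo 1 > wakeup/reset
        echo 0 > wakeup/pid
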
217 diff --git a/MAINTAINERS b/MAINTAINERS
218 index 63cefa62324c..be0ea1e5c4cc 100644
219 --- a/MAINTAINERS
220 +++ b/MAINTAINERS
221 @@ -5196,6 +5196,23 @@ F:       fs/fuse/
222  F:     include/uapi/linux/fuse.h
223  F:     Documentation/filesystems/fuse.txt
224  
225 +FUTEX SUBSYSTEM
226 +M:     Thomas Gleixner <tglx@linutronix.de>
227 +M:     Ingo Molnar <mingo@redhat.com>
228 +R:     Peter Zijlstra <peterz@infradead.org>
229 +R:     Darren Hart <dvhart@infradead.org>
230 +L:     linux-kernel@vger.kernel.org
231 +T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
232 +S:     Maintained
233 +F:     kernel/futex.c
234 +F:     kernel/futex_compat.c
235 +F:     include/asm-generic/futex.h
236 +F:     include/linux/futex.h
237 +F:     include/uapi/linux/futex.h
238 +F:     tools/testing/selftests/futex/
239 +F:     tools/perf/bench/futex*
240 +F:     Documentation/*futex*
241 +
242  FUTURE DOMAIN TMC-16x0 SCSI DRIVER (16-bit)
243  M:     Rik Faith <faith@cs.unc.edu>
244  L:     linux-scsi@vger.kernel.org
245 diff --git a/arch/Kconfig b/arch/Kconfig
246 index 659bdd079277..099fc0f5155e 100644
247 --- a/arch/Kconfig
248 +++ b/arch/Kconfig
249 @@ -9,6 +9,7 @@ config OPROFILE
250         tristate "OProfile system profiling"
251         depends on PROFILING
252         depends on HAVE_OPROFILE
253 +       depends on !PREEMPT_RT_FULL
254         select RING_BUFFER
255         select RING_BUFFER_ALLOW_SWAP
256         help
257 @@ -52,6 +53,7 @@ config KPROBES
258  config JUMP_LABEL
259         bool "Optimize very unlikely/likely branches"
260         depends on HAVE_ARCH_JUMP_LABEL
261 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
262         help
263           This option enables a transparent branch optimization that
264          makes certain almost-always-true or almost-always-false branch
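
The dependencies added above make OPROFILE mutually exclusive with PREEMPT_RT_FULL, and JUMP_LABEL mutually exclusive with the latency-histogram options. As a sketch of how the corresponding symbols might be switched in an existing .config with the kernel's scripts/config helper (run from the top of the patched tree; olddefconfig then resolves the dependent options, including the preemption-model choice):

        scripts/config --enable PREEMPT_RT_FULL \
                       --enable WAKEUP_LATENCY_HIST \
                       --enable MISSED_TIMER_OFFSETS_HIST
        make olddefconfig
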
265 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
266 index b5d529fdffab..5715844e83e3 100644
267 --- a/arch/arm/Kconfig
268 +++ b/arch/arm/Kconfig
269 @@ -36,7 +36,7 @@ config ARM
270         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
271         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
272         select HAVE_ARCH_HARDENED_USERCOPY
273 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
274 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
275         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
276         select HAVE_ARCH_MMAP_RND_BITS if MMU
277         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
278 @@ -75,6 +75,7 @@ config ARM
279         select HAVE_PERF_EVENTS
280         select HAVE_PERF_REGS
281         select HAVE_PERF_USER_STACK_DUMP
282 +       select HAVE_PREEMPT_LAZY
283         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
284         select HAVE_REGS_AND_STACK_ACCESS_API
285         select HAVE_SYSCALL_TRACEPOINTS
286 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
287 index e53638c8ed8a..6095a1649865 100644
288 --- a/arch/arm/include/asm/irq.h
289 +++ b/arch/arm/include/asm/irq.h
290 @@ -22,6 +22,8 @@
291  #endif
292  
293  #ifndef __ASSEMBLY__
294 +#include <linux/cpumask.h>
295 +
296  struct irqaction;
297  struct pt_regs;
298  extern void migrate_irqs(void);
299 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
300 index 12ebfcc1d539..c962084605bc 100644
301 --- a/arch/arm/include/asm/switch_to.h
302 +++ b/arch/arm/include/asm/switch_to.h
303 @@ -3,6 +3,13 @@
304  
305  #include <linux/thread_info.h>
306  
307 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
308 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
309 +#else
310 +static inline void
311 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
312 +#endif
313 +
314  /*
315   * For v7 SMP cores running a preemptible kernel we may be pre-empted
316   * during a TLB maintenance operation, so execute an inner-shareable dsb
317 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
318  #define switch_to(prev,next,last)                                      \
319  do {                                                                   \
320         __complete_pending_tlbi();                                      \
321 +       switch_kmaps(prev, next);                                       \
322         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
323  } while (0)
324  
325 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
326 index 776757d1604a..1f36a4eccc72 100644
327 --- a/arch/arm/include/asm/thread_info.h
328 +++ b/arch/arm/include/asm/thread_info.h
329 @@ -49,6 +49,7 @@ struct cpu_context_save {
330  struct thread_info {
331         unsigned long           flags;          /* low level flags */
332         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
333 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
334         mm_segment_t            addr_limit;     /* address limit */
335         struct task_struct      *task;          /* main task structure */
336         __u32                   cpu;            /* cpu */
337 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
338  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
339  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
340  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
341 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
342 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
343 +#define TIF_NEED_RESCHED_LAZY  7
344  
345  #define TIF_NOHZ               12      /* in adaptive nohz mode */
346  #define TIF_USING_IWMMXT       17
347 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
348  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
349  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
350  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
351 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
352  #define _TIF_UPROBE            (1 << TIF_UPROBE)
353  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
354  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
355 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
356   * Change these and you break ASM code in entry-common.S
357   */
358  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
359 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
360 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
361 +                                _TIF_NEED_RESCHED_LAZY)
362  
363  #endif /* __KERNEL__ */
364  #endif /* __ASM_ARM_THREAD_INFO_H */
365 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
366 index 608008229c7d..3866da3f7bb7 100644
367 --- a/arch/arm/kernel/asm-offsets.c
368 +++ b/arch/arm/kernel/asm-offsets.c
369 @@ -65,6 +65,7 @@ int main(void)
370    BLANK();
371    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
372    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
373 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
374    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
375    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
376    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
377 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
378 index 9f157e7c51e7..468e224d76aa 100644
379 --- a/arch/arm/kernel/entry-armv.S
380 +++ b/arch/arm/kernel/entry-armv.S
381 @@ -220,11 +220,18 @@ ENDPROC(__dabt_svc)
382  
383  #ifdef CONFIG_PREEMPT
384         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
385 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
386         teq     r8, #0                          @ if preempt count != 0
387 +       bne     1f                              @ return from exception
388 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
389 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
390 +       blne    svc_preempt                     @ preempt!
391 +
392 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
393 +       teq     r8, #0                          @ if preempt lazy count != 0
394         movne   r0, #0                          @ force flags to 0
395 -       tst     r0, #_TIF_NEED_RESCHED
396 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
397         blne    svc_preempt
398 +1:
399  #endif
400  
401         svc_exit r5, irq = 1                    @ return from exception
402 @@ -239,8 +246,14 @@ ENDPROC(__irq_svc)
403  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
404         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
405         tst     r0, #_TIF_NEED_RESCHED
406 +       bne     1b
407 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
408         reteq   r8                              @ go again
409 -       b       1b
410 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
411 +       teq     r0, #0                          @ if preempt lazy count != 0
412 +       beq     1b
413 +       ret     r8                              @ go again
414 +
415  #endif
416  
417  __und_fault:
418 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
419 index 10c3283d6c19..8872937862cc 100644
420 --- a/arch/arm/kernel/entry-common.S
421 +++ b/arch/arm/kernel/entry-common.S
422 @@ -36,7 +36,9 @@
423   UNWIND(.cantunwind    )
424         disable_irq_notrace                     @ disable interrupts
425         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
426 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
427 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
428 +       bne     fast_work_pending
429 +       tst     r1, #_TIF_SECCOMP
430         bne     fast_work_pending
431  
432         /* perform architecture specific actions before user return */
433 @@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
434         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
435         disable_irq_notrace                     @ disable interrupts
436         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
437 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
438 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
439 +       bne     do_slower_path
440 +       tst     r1, #_TIF_SECCOMP
441         beq     no_work_pending
442 +do_slower_path:
443   UNWIND(.fnend         )
444  ENDPROC(ret_fast_syscall)
445  
446 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
447 index 69bda1a5707e..1f665acaa6a9 100644
448 --- a/arch/arm/kernel/patch.c
449 +++ b/arch/arm/kernel/patch.c
450 @@ -15,7 +15,7 @@ struct patch {
451         unsigned int insn;
452  };
453  
454 -static DEFINE_SPINLOCK(patch_lock);
455 +static DEFINE_RAW_SPINLOCK(patch_lock);
456  
457  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
458         __acquires(&patch_lock)
459 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
460                 return addr;
461  
462         if (flags)
463 -               spin_lock_irqsave(&patch_lock, *flags);
464 +               raw_spin_lock_irqsave(&patch_lock, *flags);
465         else
466                 __acquire(&patch_lock);
467  
468 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
469         clear_fixmap(fixmap);
470  
471         if (flags)
472 -               spin_unlock_irqrestore(&patch_lock, *flags);
473 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
474         else
475                 __release(&patch_lock);
476  }
477 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
478 index 91d2d5b01414..750550098b59 100644
479 --- a/arch/arm/kernel/process.c
480 +++ b/arch/arm/kernel/process.c
481 @@ -322,6 +322,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
482  }
483  
484  #ifdef CONFIG_MMU
485 +/*
486 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
487 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
488 + * fail.
489 + */
490 +static int __init vectors_user_mapping_init_page(void)
491 +{
492 +       struct page *page;
493 +       unsigned long addr = 0xffff0000;
494 +       pgd_t *pgd;
495 +       pud_t *pud;
496 +       pmd_t *pmd;
497 +
498 +       pgd = pgd_offset_k(addr);
499 +       pud = pud_offset(pgd, addr);
500 +       pmd = pmd_offset(pud, addr);
501 +       page = pmd_page(*(pmd));
502 +
503 +       pgtable_page_ctor(page);
504 +
505 +       return 0;
506 +}
507 +late_initcall(vectors_user_mapping_init_page);
508 +
509  #ifdef CONFIG_KUSER_HELPERS
510  /*
511   * The vectors page is always readable from user space for the
512 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
513 index 7b8f2141427b..96541e00b74a 100644
514 --- a/arch/arm/kernel/signal.c
515 +++ b/arch/arm/kernel/signal.c
516 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
517          */
518         trace_hardirqs_off();
519         do {
520 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
521 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
522 +                                          _TIF_NEED_RESCHED_LAZY))) {
523                         schedule();
524                 } else {
525                         if (unlikely(!user_mode(regs)))
526 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
527 index 7dd14e8395e6..4cd7e3d98035 100644
528 --- a/arch/arm/kernel/smp.c
529 +++ b/arch/arm/kernel/smp.c
530 @@ -234,8 +234,6 @@ int __cpu_disable(void)
531         flush_cache_louis();
532         local_flush_tlb_all();
533  
534 -       clear_tasks_mm_cpumask(cpu);
535 -
536         return 0;
537  }
538  
539 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
540                 pr_err("CPU%u: cpu didn't die\n", cpu);
541                 return;
542         }
543 +
544 +       clear_tasks_mm_cpumask(cpu);
545 +
546         pr_notice("CPU%u: shutdown\n", cpu);
547  
548         /*
549 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
550 index 0bee233fef9a..314cfb232a63 100644
551 --- a/arch/arm/kernel/unwind.c
552 +++ b/arch/arm/kernel/unwind.c
553 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
554  static const struct unwind_idx *__origin_unwind_idx;
555  extern const struct unwind_idx __stop_unwind_idx[];
556  
557 -static DEFINE_SPINLOCK(unwind_lock);
558 +static DEFINE_RAW_SPINLOCK(unwind_lock);
559  static LIST_HEAD(unwind_tables);
560  
561  /* Convert a prel31 symbol to an absolute address */
562 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
563                 /* module unwind tables */
564                 struct unwind_table *table;
565  
566 -               spin_lock_irqsave(&unwind_lock, flags);
567 +               raw_spin_lock_irqsave(&unwind_lock, flags);
568                 list_for_each_entry(table, &unwind_tables, list) {
569                         if (addr >= table->begin_addr &&
570                             addr < table->end_addr) {
571 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
572                                 break;
573                         }
574                 }
575 -               spin_unlock_irqrestore(&unwind_lock, flags);
576 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
577         }
578  
579         pr_debug("%s: idx = %p\n", __func__, idx);
580 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
581         tab->begin_addr = text_addr;
582         tab->end_addr = text_addr + text_size;
583  
584 -       spin_lock_irqsave(&unwind_lock, flags);
585 +       raw_spin_lock_irqsave(&unwind_lock, flags);
586         list_add_tail(&tab->list, &unwind_tables);
587 -       spin_unlock_irqrestore(&unwind_lock, flags);
588 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
589  
590         return tab;
591  }
592 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
593         if (!tab)
594                 return;
595  
596 -       spin_lock_irqsave(&unwind_lock, flags);
597 +       raw_spin_lock_irqsave(&unwind_lock, flags);
598         list_del(&tab->list);
599 -       spin_unlock_irqrestore(&unwind_lock, flags);
600 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
601  
602         kfree(tab);
603  }
604 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
605 index 19b5f5c1c0ff..82aa639e6737 100644
606 --- a/arch/arm/kvm/arm.c
607 +++ b/arch/arm/kvm/arm.c
608 @@ -619,7 +619,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
609                  * involves poking the GIC, which must be done in a
610                  * non-preemptible context.
611                  */
612 -               preempt_disable();
613 +               migrate_disable();
614                 kvm_pmu_flush_hwstate(vcpu);
615                 kvm_timer_flush_hwstate(vcpu);
616                 kvm_vgic_flush_hwstate(vcpu);
617 @@ -640,7 +640,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
618                         kvm_pmu_sync_hwstate(vcpu);
619                         kvm_timer_sync_hwstate(vcpu);
620                         kvm_vgic_sync_hwstate(vcpu);
621 -                       preempt_enable();
622 +                       migrate_enable();
623                         continue;
624                 }
625  
626 @@ -696,7 +696,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
627  
628                 kvm_vgic_sync_hwstate(vcpu);
629  
630 -               preempt_enable();
631 +               migrate_enable();
632  
633                 ret = handle_exit(vcpu, run, ret);
634         }
635 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
636 index 98ffe1e62ad5..df9769ddece5 100644
637 --- a/arch/arm/mach-exynos/platsmp.c
638 +++ b/arch/arm/mach-exynos/platsmp.c
639 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
640         return (void __iomem *)(S5P_VA_SCU);
641  }
642  
643 -static DEFINE_SPINLOCK(boot_lock);
644 +static DEFINE_RAW_SPINLOCK(boot_lock);
645  
646  static void exynos_secondary_init(unsigned int cpu)
647  {
648 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
649         /*
650          * Synchronise with the boot thread.
651          */
652 -       spin_lock(&boot_lock);
653 -       spin_unlock(&boot_lock);
654 +       raw_spin_lock(&boot_lock);
655 +       raw_spin_unlock(&boot_lock);
656  }
657  
658  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
659 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
660          * Set synchronisation state between this boot processor
661          * and the secondary one
662          */
663 -       spin_lock(&boot_lock);
664 +       raw_spin_lock(&boot_lock);
665  
666         /*
667          * The secondary processor is waiting to be released from
668 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
669  
670                 if (timeout == 0) {
671                         printk(KERN_ERR "cpu1 power enable failed");
672 -                       spin_unlock(&boot_lock);
673 +                       raw_spin_unlock(&boot_lock);
674                         return -ETIMEDOUT;
675                 }
676         }
677 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
678          * calibrations, then wait for it to finish
679          */
680  fail:
681 -       spin_unlock(&boot_lock);
682 +       raw_spin_unlock(&boot_lock);
683  
684         return pen_release != -1 ? ret : 0;
685  }
686 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
687 index 4b653a8cb75c..b03d5a922cb1 100644
688 --- a/arch/arm/mach-hisi/platmcpm.c
689 +++ b/arch/arm/mach-hisi/platmcpm.c
690 @@ -61,7 +61,7 @@
691  
692  static void __iomem *sysctrl, *fabric;
693  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
694 -static DEFINE_SPINLOCK(boot_lock);
695 +static DEFINE_RAW_SPINLOCK(boot_lock);
696  static u32 fabric_phys_addr;
697  /*
698   * [0]: bootwrapper physical address
699 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
700         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
701                 return -EINVAL;
702  
703 -       spin_lock_irq(&boot_lock);
704 +       raw_spin_lock_irq(&boot_lock);
705  
706         if (hip04_cpu_table[cluster][cpu])
707                 goto out;
708 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
709  
710  out:
711         hip04_cpu_table[cluster][cpu]++;
712 -       spin_unlock_irq(&boot_lock);
713 +       raw_spin_unlock_irq(&boot_lock);
714  
715         return 0;
716  }
717 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
718         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
719         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
720  
721 -       spin_lock(&boot_lock);
722 +       raw_spin_lock(&boot_lock);
723         hip04_cpu_table[cluster][cpu]--;
724         if (hip04_cpu_table[cluster][cpu] == 1) {
725                 /* A power_up request went ahead of us. */
726 -               spin_unlock(&boot_lock);
727 +               raw_spin_unlock(&boot_lock);
728                 return;
729         } else if (hip04_cpu_table[cluster][cpu] > 1) {
730                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
731 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
732         }
733  
734         last_man = hip04_cluster_is_down(cluster);
735 -       spin_unlock(&boot_lock);
736 +       raw_spin_unlock(&boot_lock);
737         if (last_man) {
738                 /* Since it's Cortex A15, disable L2 prefetching. */
739                 asm volatile(
740 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
741                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
742  
743         count = TIMEOUT_MSEC / POLL_MSEC;
744 -       spin_lock_irq(&boot_lock);
745 +       raw_spin_lock_irq(&boot_lock);
746         for (tries = 0; tries < count; tries++) {
747                 if (hip04_cpu_table[cluster][cpu])
748                         goto err;
749 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
750                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
751                 if (data & CORE_WFI_STATUS(cpu))
752                         break;
753 -               spin_unlock_irq(&boot_lock);
754 +               raw_spin_unlock_irq(&boot_lock);
755                 /* Wait for clean L2 when the whole cluster is down. */
756                 msleep(POLL_MSEC);
757 -               spin_lock_irq(&boot_lock);
758 +               raw_spin_lock_irq(&boot_lock);
759         }
760         if (tries >= count)
761                 goto err;
762 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
763                 goto err;
764         if (hip04_cluster_is_down(cluster))
765                 hip04_set_snoop_filter(cluster, 0);
766 -       spin_unlock_irq(&boot_lock);
767 +       raw_spin_unlock_irq(&boot_lock);
768         return 1;
769  err:
770 -       spin_unlock_irq(&boot_lock);
771 +       raw_spin_unlock_irq(&boot_lock);
772         return 0;
773  }
774  #endif
775 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
776 index b4de3da6dffa..b52893319d75 100644
777 --- a/arch/arm/mach-omap2/omap-smp.c
778 +++ b/arch/arm/mach-omap2/omap-smp.c
779 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
780         .startup_addr = omap5_secondary_startup,
781  };
782  
783 -static DEFINE_SPINLOCK(boot_lock);
784 +static DEFINE_RAW_SPINLOCK(boot_lock);
785  
786  void __iomem *omap4_get_scu_base(void)
787  {
788 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
789         /*
790          * Synchronise with the boot thread.
791          */
792 -       spin_lock(&boot_lock);
793 -       spin_unlock(&boot_lock);
794 +       raw_spin_lock(&boot_lock);
795 +       raw_spin_unlock(&boot_lock);
796  }
797  
798  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
799 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
800          * Set synchronisation state between this boot processor
801          * and the secondary one
802          */
803 -       spin_lock(&boot_lock);
804 +       raw_spin_lock(&boot_lock);
805  
806         /*
807          * Update the AuxCoreBoot0 with boot state for secondary core.
808 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
809          * Now the secondary core is starting up let it run its
810          * calibrations, then wait for it to finish
811          */
812 -       spin_unlock(&boot_lock);
813 +       raw_spin_unlock(&boot_lock);
814  
815         return 0;
816  }
817 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
818 index 0875b99add18..18b6d98d2581 100644
819 --- a/arch/arm/mach-prima2/platsmp.c
820 +++ b/arch/arm/mach-prima2/platsmp.c
821 @@ -22,7 +22,7 @@
822  
823  static void __iomem *clk_base;
824  
825 -static DEFINE_SPINLOCK(boot_lock);
826 +static DEFINE_RAW_SPINLOCK(boot_lock);
827  
828  static void sirfsoc_secondary_init(unsigned int cpu)
829  {
830 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
831         /*
832          * Synchronise with the boot thread.
833          */
834 -       spin_lock(&boot_lock);
835 -       spin_unlock(&boot_lock);
836 +       raw_spin_lock(&boot_lock);
837 +       raw_spin_unlock(&boot_lock);
838  }
839  
840  static const struct of_device_id clk_ids[]  = {
841 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
842         /* make sure write buffer is drained */
843         mb();
844  
845 -       spin_lock(&boot_lock);
846 +       raw_spin_lock(&boot_lock);
847  
848         /*
849          * The secondary processor is waiting to be released from
850 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
851          * now the secondary core is starting up let it run its
852          * calibrations, then wait for it to finish
853          */
854 -       spin_unlock(&boot_lock);
855 +       raw_spin_unlock(&boot_lock);
856  
857         return pen_release != -1 ? -ENOSYS : 0;
858  }
859 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
860 index 5494c9e0c909..e8ce157d3548 100644
861 --- a/arch/arm/mach-qcom/platsmp.c
862 +++ b/arch/arm/mach-qcom/platsmp.c
863 @@ -46,7 +46,7 @@
864  
865  extern void secondary_startup_arm(void);
866  
867 -static DEFINE_SPINLOCK(boot_lock);
868 +static DEFINE_RAW_SPINLOCK(boot_lock);
869  
870  #ifdef CONFIG_HOTPLUG_CPU
871  static void qcom_cpu_die(unsigned int cpu)
872 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
873         /*
874          * Synchronise with the boot thread.
875          */
876 -       spin_lock(&boot_lock);
877 -       spin_unlock(&boot_lock);
878 +       raw_spin_lock(&boot_lock);
879 +       raw_spin_unlock(&boot_lock);
880  }
881  
882  static int scss_release_secondary(unsigned int cpu)
883 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
884          * set synchronisation state between this boot processor
885          * and the secondary one
886          */
887 -       spin_lock(&boot_lock);
888 +       raw_spin_lock(&boot_lock);
889  
890         /*
891          * Send the secondary CPU a soft interrupt, thereby causing
892 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
893          * now the secondary core is starting up let it run its
894          * calibrations, then wait for it to finish
895          */
896 -       spin_unlock(&boot_lock);
897 +       raw_spin_unlock(&boot_lock);
898  
899         return ret;
900  }
901 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
902 index 8d1e2d551786..7fa56cc78118 100644
903 --- a/arch/arm/mach-spear/platsmp.c
904 +++ b/arch/arm/mach-spear/platsmp.c
905 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
906         sync_cache_w(&pen_release);
907  }
908  
909 -static DEFINE_SPINLOCK(boot_lock);
910 +static DEFINE_RAW_SPINLOCK(boot_lock);
911  
912  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
913  
914 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
915         /*
916          * Synchronise with the boot thread.
917          */
918 -       spin_lock(&boot_lock);
919 -       spin_unlock(&boot_lock);
920 +       raw_spin_lock(&boot_lock);
921 +       raw_spin_unlock(&boot_lock);
922  }
923  
924  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
925 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
926          * set synchronisation state between this boot processor
927          * and the secondary one
928          */
929 -       spin_lock(&boot_lock);
930 +       raw_spin_lock(&boot_lock);
931  
932         /*
933          * The secondary processor is waiting to be released from
934 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
935          * now the secondary core is starting up let it run its
936          * calibrations, then wait for it to finish
937          */
938 -       spin_unlock(&boot_lock);
939 +       raw_spin_unlock(&boot_lock);
940  
941         return pen_release != -1 ? -ENOSYS : 0;
942  }
943 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
944 index ea5a2277ee46..b988e081ac79 100644
945 --- a/arch/arm/mach-sti/platsmp.c
946 +++ b/arch/arm/mach-sti/platsmp.c
947 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
948         sync_cache_w(&pen_release);
949  }
950  
951 -static DEFINE_SPINLOCK(boot_lock);
952 +static DEFINE_RAW_SPINLOCK(boot_lock);
953  
954  static void sti_secondary_init(unsigned int cpu)
955  {
956 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
957         /*
958          * Synchronise with the boot thread.
959          */
960 -       spin_lock(&boot_lock);
961 -       spin_unlock(&boot_lock);
962 +       raw_spin_lock(&boot_lock);
963 +       raw_spin_unlock(&boot_lock);
964  }
965  
966  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
967 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
968          * set synchronisation state between this boot processor
969          * and the secondary one
970          */
971 -       spin_lock(&boot_lock);
972 +       raw_spin_lock(&boot_lock);
973  
974         /*
975          * The secondary processor is waiting to be released from
976 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
977          * now the secondary core is starting up let it run its
978          * calibrations, then wait for it to finish
979          */
980 -       spin_unlock(&boot_lock);
981 +       raw_spin_unlock(&boot_lock);
982  
983         return pen_release != -1 ? -ENOSYS : 0;
984  }
985 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
986 index 0122ad1a6027..926b1be48043 100644
987 --- a/arch/arm/mm/fault.c
988 +++ b/arch/arm/mm/fault.c
989 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
990         if (addr < TASK_SIZE)
991                 return do_page_fault(addr, fsr, regs);
992  
993 +       if (interrupts_enabled(regs))
994 +               local_irq_enable();
995 +
996         if (user_mode(regs))
997                 goto bad_area;
998  
999 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1000  static int
1001  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1002  {
1003 +       if (interrupts_enabled(regs))
1004 +               local_irq_enable();
1005 +
1006         do_bad_area(addr, fsr, regs);
1007         return 0;
1008  }
1009 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1010 index d02f8187b1cc..542692dbd40a 100644
1011 --- a/arch/arm/mm/highmem.c
1012 +++ b/arch/arm/mm/highmem.c
1013 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1014         return *ptep;
1015  }
1016  
1017 +static unsigned int fixmap_idx(int type)
1018 +{
1019 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1020 +}
1021 +
1022  void *kmap(struct page *page)
1023  {
1024         might_sleep();
1025 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1026  
1027  void *kmap_atomic(struct page *page)
1028  {
1029 +       pte_t pte = mk_pte(page, kmap_prot);
1030         unsigned int idx;
1031         unsigned long vaddr;
1032         void *kmap;
1033         int type;
1034  
1035 -       preempt_disable();
1036 +       preempt_disable_nort();
1037         pagefault_disable();
1038         if (!PageHighMem(page))
1039                 return page_address(page);
1040 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1041  
1042         type = kmap_atomic_idx_push();
1043  
1044 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1045 +       idx = fixmap_idx(type);
1046         vaddr = __fix_to_virt(idx);
1047  #ifdef CONFIG_DEBUG_HIGHMEM
1048         /*
1049 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1050          * in place, so the contained TLB flush ensures the TLB is updated
1051          * with the new mapping.
1052          */
1053 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1054 +#ifdef CONFIG_PREEMPT_RT_FULL
1055 +       current->kmap_pte[type] = pte;
1056 +#endif
1057 +       set_fixmap_pte(idx, pte);
1058  
1059         return (void *)vaddr;
1060  }
1061 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1062  
1063         if (kvaddr >= (void *)FIXADDR_START) {
1064                 type = kmap_atomic_idx();
1065 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1066 +               idx = fixmap_idx(type);
1067  
1068                 if (cache_is_vivt())
1069                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1070 +#ifdef CONFIG_PREEMPT_RT_FULL
1071 +               current->kmap_pte[type] = __pte(0);
1072 +#endif
1073  #ifdef CONFIG_DEBUG_HIGHMEM
1074                 BUG_ON(vaddr != __fix_to_virt(idx));
1075 -               set_fixmap_pte(idx, __pte(0));
1076  #else
1077                 (void) idx;  /* to kill a warning */
1078  #endif
1079 +               set_fixmap_pte(idx, __pte(0));
1080                 kmap_atomic_idx_pop();
1081         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1082                 /* this address was obtained through kmap_high_get() */
1083                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1084         }
1085         pagefault_enable();
1086 -       preempt_enable();
1087 +       preempt_enable_nort();
1088  }
1089  EXPORT_SYMBOL(__kunmap_atomic);
1090  
1091  void *kmap_atomic_pfn(unsigned long pfn)
1092  {
1093 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1094         unsigned long vaddr;
1095         int idx, type;
1096         struct page *page = pfn_to_page(pfn);
1097  
1098 -       preempt_disable();
1099 +       preempt_disable_nort();
1100         pagefault_disable();
1101         if (!PageHighMem(page))
1102                 return page_address(page);
1103  
1104         type = kmap_atomic_idx_push();
1105 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1106 +       idx = fixmap_idx(type);
1107         vaddr = __fix_to_virt(idx);
1108  #ifdef CONFIG_DEBUG_HIGHMEM
1109         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1110  #endif
1111 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1112 +#ifdef CONFIG_PREEMPT_RT_FULL
1113 +       current->kmap_pte[type] = pte;
1114 +#endif
1115 +       set_fixmap_pte(idx, pte);
1116  
1117         return (void *)vaddr;
1118  }
1119 +#if defined CONFIG_PREEMPT_RT_FULL
1120 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1121 +{
1122 +       int i;
1123 +
1124 +       /*
1125 +        * Clear @prev's kmap_atomic mappings
1126 +        */
1127 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1128 +               int idx = fixmap_idx(i);
1129 +
1130 +               set_fixmap_pte(idx, __pte(0));
1131 +       }
1132 +       /*
1133 +        * Restore @next_p's kmap_atomic mappings
1134 +        */
1135 +       for (i = 0; i < next_p->kmap_idx; i++) {
1136 +               int idx = fixmap_idx(i);
1137 +
1138 +               if (!pte_none(next_p->kmap_pte[i]))
1139 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1140 +       }
1141 +}
1142 +#endif
1143 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1144 index c2366510187a..6b60f582b738 100644
1145 --- a/arch/arm/plat-versatile/platsmp.c
1146 +++ b/arch/arm/plat-versatile/platsmp.c
1147 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1148         sync_cache_w(&pen_release);
1149  }
1150  
1151 -static DEFINE_SPINLOCK(boot_lock);
1152 +static DEFINE_RAW_SPINLOCK(boot_lock);
1153  
1154  void versatile_secondary_init(unsigned int cpu)
1155  {
1156 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1157         /*
1158          * Synchronise with the boot thread.
1159          */
1160 -       spin_lock(&boot_lock);
1161 -       spin_unlock(&boot_lock);
1162 +       raw_spin_lock(&boot_lock);
1163 +       raw_spin_unlock(&boot_lock);
1164  }
1165  
1166  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1167 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1168          * Set synchronisation state between this boot processor
1169          * and the secondary one
1170          */
1171 -       spin_lock(&boot_lock);
1172 +       raw_spin_lock(&boot_lock);
1173  
1174         /*
1175          * This is really belt and braces; we hold unintended secondary
1176 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1177          * now the secondary core is starting up let it run its
1178          * calibrations, then wait for it to finish
1179          */
1180 -       spin_unlock(&boot_lock);
1181 +       raw_spin_unlock(&boot_lock);
1182  
1183         return pen_release != -1 ? -ENOSYS : 0;
1184  }
1185 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1186 index cf57a7799a0f..78d1b49fbed5 100644
1187 --- a/arch/arm64/Kconfig
1188 +++ b/arch/arm64/Kconfig
1189 @@ -91,6 +91,7 @@ config ARM64
1190         select HAVE_PERF_EVENTS
1191         select HAVE_PERF_REGS
1192         select HAVE_PERF_USER_STACK_DUMP
1193 +       select HAVE_PREEMPT_LAZY
1194         select HAVE_REGS_AND_STACK_ACCESS_API
1195         select HAVE_RCU_TABLE_FREE
1196         select HAVE_SYSCALL_TRACEPOINTS
1197 @@ -704,7 +705,7 @@ config XEN_DOM0
1198  
1199  config XEN
1200         bool "Xen guest support on ARM64"
1201 -       depends on ARM64 && OF
1202 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1203         select SWIOTLB_XEN
1204         select PARAVIRT
1205         help
1206 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1207 index e9ea5a6bd449..6c500ad63c6a 100644
1208 --- a/arch/arm64/include/asm/thread_info.h
1209 +++ b/arch/arm64/include/asm/thread_info.h
1210 @@ -49,6 +49,7 @@ struct thread_info {
1211         mm_segment_t            addr_limit;     /* address limit */
1212         struct task_struct      *task;          /* main task structure */
1213         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1214 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1215         int                     cpu;            /* cpu */
1216  };
1217  
1218 @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
1219  #define TIF_NEED_RESCHED       1
1220  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1221  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1222 +#define TIF_NEED_RESCHED_LAZY  4
1223  #define TIF_NOHZ               7
1224  #define TIF_SYSCALL_TRACE      8
1225  #define TIF_SYSCALL_AUDIT      9
1226 @@ -127,6 +129,7 @@ static inline struct thread_info *current_thread_info(void)
1227  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1228  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1229  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1230 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1231  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1232  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1233  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1234 @@ -135,7 +138,9 @@ static inline struct thread_info *current_thread_info(void)
1235  #define _TIF_32BIT             (1 << TIF_32BIT)
1236  
1237  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1238 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1239 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1240 +                                _TIF_NEED_RESCHED_LAZY)
1241 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1242  
1243  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1244                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1245 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1246 index c58ddf8c4062..a8f2f7c1fe12 100644
1247 --- a/arch/arm64/kernel/asm-offsets.c
1248 +++ b/arch/arm64/kernel/asm-offsets.c
1249 @@ -38,6 +38,7 @@ int main(void)
1250    BLANK();
1251    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1252    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1253 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1254    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1255    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1256    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1257 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1258 index 79b0fe24d5b7..f3c959ade308 100644
1259 --- a/arch/arm64/kernel/entry.S
1260 +++ b/arch/arm64/kernel/entry.S
1261 @@ -428,11 +428,16 @@ ENDPROC(el1_sync)
1262  
1263  #ifdef CONFIG_PREEMPT
1264         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1265 -       cbnz    w24, 1f                         // preempt count != 0
1266 +       cbnz    w24, 2f                         // preempt count != 0
1267         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1268 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1269 -       bl      el1_preempt
1270 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1271 +
1272 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1273 +       cbnz    w24, 2f                         // preempt lazy count != 0
1274 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1275  1:
1276 +       bl      el1_preempt
1277 +2:
1278  #endif
1279  #ifdef CONFIG_TRACE_IRQFLAGS
1280         bl      trace_hardirqs_on
1281 @@ -446,6 +451,7 @@ ENDPROC(el1_irq)
1282  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1283         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1284         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1285 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1286         ret     x24
1287  #endif
1288  
1289 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
1290 index 404dd67080b9..639dc6d12e72 100644
1291 --- a/arch/arm64/kernel/signal.c
1292 +++ b/arch/arm64/kernel/signal.c
1293 @@ -409,7 +409,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
1294          */
1295         trace_hardirqs_off();
1296         do {
1297 -               if (thread_flags & _TIF_NEED_RESCHED) {
1298 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1299                         schedule();
1300                 } else {
1301                         local_irq_enable();
1302 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1303 index 5a4f2eb9d0d5..867eca2e7210 100644
1304 --- a/arch/mips/Kconfig
1305 +++ b/arch/mips/Kconfig
1306 @@ -2515,7 +2515,7 @@ config MIPS_ASID_BITS_VARIABLE
1307  #
1308  config HIGHMEM
1309         bool "High Memory Support"
1310 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1311 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1312  
1313  config CPU_SUPPORTS_HIGHMEM
1314         bool
1315 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1316 index 65fba4c34cd7..4b5ba68910e0 100644
1317 --- a/arch/powerpc/Kconfig
1318 +++ b/arch/powerpc/Kconfig
1319 @@ -52,10 +52,11 @@ config LOCKDEP_SUPPORT
1320  
1321  config RWSEM_GENERIC_SPINLOCK
1322         bool
1323 +       default y if PREEMPT_RT_FULL
1324  
1325  config RWSEM_XCHGADD_ALGORITHM
1326         bool
1327 -       default y
1328 +       default y if !PREEMPT_RT_FULL
1329  
1330  config GENERIC_LOCKBREAK
1331         bool
1332 @@ -134,6 +135,7 @@ config PPC
1333         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1334         select GENERIC_STRNCPY_FROM_USER
1335         select GENERIC_STRNLEN_USER
1336 +       select HAVE_PREEMPT_LAZY
1337         select HAVE_MOD_ARCH_SPECIFIC
1338         select MODULES_USE_ELF_RELA
1339         select CLONE_BACKWARDS
1340 @@ -321,7 +323,7 @@ menu "Kernel options"
1341  
1342  config HIGHMEM
1343         bool "High memory support"
1344 -       depends on PPC32
1345 +       depends on PPC32 && !PREEMPT_RT_FULL
1346  
1347  source kernel/Kconfig.hz
1348  source kernel/Kconfig.preempt
1349 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1350 index 87e4b2d8dcd4..981e501a4359 100644
1351 --- a/arch/powerpc/include/asm/thread_info.h
1352 +++ b/arch/powerpc/include/asm/thread_info.h
1353 @@ -43,6 +43,8 @@ struct thread_info {
1354         int             cpu;                    /* cpu we're on */
1355         int             preempt_count;          /* 0 => preemptable,
1356                                                    <0 => BUG */
1357 +       int             preempt_lazy_count;     /* 0 => preemptable,
1358 +                                                  <0 => BUG */
1359         unsigned long   local_flags;            /* private flags for thread */
1360  #ifdef CONFIG_LIVEPATCH
1361         unsigned long *livepatch_sp;
1362 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1363  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1364  #define TIF_SIGPENDING         1       /* signal pending */
1365  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1366 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1367 -                                          TIF_NEED_RESCHED */
1368 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1369  #define TIF_32BIT              4       /* 32 bit binary */
1370  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1371  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1372 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1373  #if defined(CONFIG_PPC64)
1374  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1375  #endif
1376 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1377 +                                          TIF_NEED_RESCHED */
1378  
1379  /* as above, but as bit values */
1380  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1381 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1382  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1383  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1384  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1385 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1386  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1387                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1388                                  _TIF_NOHZ)
1389  
1390  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1391                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1392 -                                _TIF_RESTORE_TM)
1393 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1394  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1395 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1396  
1397  /* Bits in local_flags */
1398  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1399 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1400 index c833d88c423d..96e9fbc3f684 100644
1401 --- a/arch/powerpc/kernel/asm-offsets.c
1402 +++ b/arch/powerpc/kernel/asm-offsets.c
1403 @@ -156,6 +156,7 @@ int main(void)
1404         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1405         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1406         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1407 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1408         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1409         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1410  
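
TI_PREEMPT_LAZY is added to asm-offsets.c so the new thread_info field is reachable from the assembly in entry_32.S and entry_64.S below. The constants are emitted as markers in generated assembly and then rewritten into a header by kbuild. A condensed sketch of how such an offset is produced (the struct layout here is illustrative; the real macro lives in include/linux/kbuild.h):

#include <stddef.h>

/* Emit "->SYM <value>" into the compiler's asm output; the kbuild
 * post-processing step rewrites it into "#define SYM <value>". */
#define DEFINE(sym, val) \
        asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

struct ti_example {
        unsigned long flags;
        int preempt_count;
        int preempt_lazy_count;
};

int main(void)
{
        DEFINE(TI_PREEMPT_LAZY_EXAMPLE,
               offsetof(struct ti_example, preempt_lazy_count));
        return 0;
}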
1411 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1412 index 3841d749a430..6dbaeff192b9 100644
1413 --- a/arch/powerpc/kernel/entry_32.S
1414 +++ b/arch/powerpc/kernel/entry_32.S
1415 @@ -835,7 +835,14 @@ user_exc_return:           /* r10 contains MSR_KERNEL here */
1416         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1417         bne     restore
1418         andi.   r8,r8,_TIF_NEED_RESCHED
1419 +       bne+    1f
1420 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1421 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1422 +       bne     restore
1423 +       lwz     r0,TI_FLAGS(r9)
1424 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1425         beq+    restore
1426 +1:
1427         lwz     r3,_MSR(r1)
1428         andi.   r0,r3,MSR_EE    /* interrupts off? */
1429         beq     restore         /* don't schedule if so */
1430 @@ -846,11 +853,11 @@ user_exc_return:          /* r10 contains MSR_KERNEL here */
1431          */
1432         bl      trace_hardirqs_off
1433  #endif
1434 -1:     bl      preempt_schedule_irq
1435 +2:     bl      preempt_schedule_irq
1436         CURRENT_THREAD_INFO(r9, r1)
1437         lwz     r3,TI_FLAGS(r9)
1438 -       andi.   r0,r3,_TIF_NEED_RESCHED
1439 -       bne-    1b
1440 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1441 +       bne-    2b
1442  #ifdef CONFIG_TRACE_IRQFLAGS
1443         /* And now, to properly rebalance the above, we tell lockdep they
1444          * are being turned back on, which will happen when we return
1445 @@ -1171,7 +1178,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
1446  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1447  
1448  do_work:                       /* r10 contains MSR_KERNEL here */
1449 -       andi.   r0,r9,_TIF_NEED_RESCHED
1450 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1451         beq     do_user_signal
1452  
1453  do_resched:                    /* r10 contains MSR_KERNEL here */
1454 @@ -1192,7 +1199,7 @@ do_resched:                       /* r10 contains MSR_KERNEL here */
1455         MTMSRD(r10)             /* disable interrupts */
1456         CURRENT_THREAD_INFO(r9, r1)
1457         lwz     r9,TI_FLAGS(r9)
1458 -       andi.   r0,r9,_TIF_NEED_RESCHED
1459 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1460         bne-    do_resched
1461         andi.   r0,r9,_TIF_USER_WORK_MASK
1462         beq     restore_user
1463 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1464 index 767ef6d68c9e..2cb4d5552319 100644
1465 --- a/arch/powerpc/kernel/entry_64.S
1466 +++ b/arch/powerpc/kernel/entry_64.S
1467 @@ -656,7 +656,7 @@ _GLOBAL(ret_from_except_lite)
1468         bl      restore_math
1469         b       restore
1470  #endif
1471 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1472 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1473         beq     2f
1474         bl      restore_interrupts
1475         SCHEDULE_USER
1476 @@ -718,10 +718,18 @@ _GLOBAL(ret_from_except_lite)
1477  
1478  #ifdef CONFIG_PREEMPT
1479         /* Check if we need to preempt */
1480 -       andi.   r0,r4,_TIF_NEED_RESCHED
1481 -       beq+    restore
1482 -       /* Check that preempt_count() == 0 and interrupts are enabled */
1483         lwz     r8,TI_PREEMPT(r9)
1484 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1485 +       bne     restore
1486 +       andi.   r0,r4,_TIF_NEED_RESCHED
1487 +       bne+    check_count
1488 +
1489 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1490 +       beq+    restore
1491 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1492 +
1493 +       /* Check that preempt_count() == 0 and interrupts are enabled */
1494 +check_count:
1495         cmpwi   cr1,r8,0
1496         ld      r0,SOFTE(r1)
1497         cmpdi   r0,0
1498 @@ -738,7 +746,7 @@ _GLOBAL(ret_from_except_lite)
1499         /* Re-test flags and eventually loop */
1500         CURRENT_THREAD_INFO(r9, r1)
1501         ld      r4,TI_FLAGS(r9)
1502 -       andi.   r0,r4,_TIF_NEED_RESCHED
1503 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1504         bne     1b
1505  
1506         /*
1507 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1508 index 3c05c311e35e..f83f6ac1274d 100644
1509 --- a/arch/powerpc/kernel/irq.c
1510 +++ b/arch/powerpc/kernel/irq.c
1511 @@ -638,6 +638,7 @@ void irq_ctx_init(void)
1512         }
1513  }
1514  
1515 +#ifndef CONFIG_PREEMPT_RT_FULL
1516  void do_softirq_own_stack(void)
1517  {
1518         struct thread_info *curtp, *irqtp;
1519 @@ -655,6 +656,7 @@ void do_softirq_own_stack(void)
1520         if (irqtp->flags)
1521                 set_bits(irqtp->flags, &curtp->flags);
1522  }
1523 +#endif
1524  
1525  irq_hw_number_t virq_to_hw(unsigned int virq)
1526  {
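
With PREEMPT_RT_FULL, softirqs run in thread context, so the helpers that process them on a dedicated per-CPU stack are compiled out here and in misc_32.S/misc_64.S below. For reference, the guarded-out code has this general shape (a condensed sketch of the powerpc helper, matching the context lines visible above):

/* Non-RT only: borrow the per-CPU softirq thread_info/stack, run the
 * softirq processing there, then merge back any flags that were set
 * while on the borrowed stack. */
void do_softirq_own_stack(void)
{
        struct thread_info *curtp = current_thread_info();
        struct thread_info *irqtp = softirq_ctx[smp_processor_id()];

        irqtp->task = curtp->task;
        call_do_softirq(irqtp);         /* asm trampoline, switches stack */
        irqtp->task = NULL;

        if (irqtp->flags)
                set_bits(irqtp->flags, &curtp->flags);
}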
1527 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1528 index 030d72df5dd5..b471a709e100 100644
1529 --- a/arch/powerpc/kernel/misc_32.S
1530 +++ b/arch/powerpc/kernel/misc_32.S
1531 @@ -41,6 +41,7 @@
1532   * We store the saved ksp_limit in the unused part
1533   * of the STACK_FRAME_OVERHEAD
1534   */
1535 +#ifndef CONFIG_PREEMPT_RT_FULL
1536  _GLOBAL(call_do_softirq)
1537         mflr    r0
1538         stw     r0,4(r1)
1539 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
1540         stw     r10,THREAD+KSP_LIMIT(r2)
1541         mtlr    r0
1542         blr
1543 +#endif
1544  
1545  /*
1546   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1547 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1548 index 4cefe6888b18..cb2ee4be999a 100644
1549 --- a/arch/powerpc/kernel/misc_64.S
1550 +++ b/arch/powerpc/kernel/misc_64.S
1551 @@ -31,6 +31,7 @@
1552  
1553         .text
1554  
1555 +#ifndef CONFIG_PREEMPT_RT_FULL
1556  _GLOBAL(call_do_softirq)
1557         mflr    r0
1558         std     r0,16(r1)
1559 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
1560         ld      r0,16(r1)
1561         mtlr    r0
1562         blr
1563 +#endif
1564  
1565  _GLOBAL(call_do_irq)
1566         mflr    r0
1567 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1568 index 029be26b5a17..9528089ea142 100644
1569 --- a/arch/powerpc/kvm/Kconfig
1570 +++ b/arch/powerpc/kvm/Kconfig
1571 @@ -175,6 +175,7 @@ config KVM_E500MC
1572  config KVM_MPIC
1573         bool "KVM in-kernel MPIC emulation"
1574         depends on KVM && E500
1575 +       depends on !PREEMPT_RT_FULL
1576         select HAVE_KVM_IRQCHIP
1577         select HAVE_KVM_IRQFD
1578         select HAVE_KVM_IRQ_ROUTING
1579 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1580 index e48462447ff0..2670cee66064 100644
1581 --- a/arch/powerpc/platforms/ps3/device-init.c
1582 +++ b/arch/powerpc/platforms/ps3/device-init.c
1583 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1584         }
1585         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1586  
1587 -       res = wait_event_interruptible(dev->done.wait,
1588 +       res = swait_event_interruptible(dev->done.wait,
1589                                        dev->done.done || kthread_should_stop());
1590         if (kthread_should_stop())
1591                 res = -EINTR;
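
The ps3 notification wait is converted from the regular waitqueue to the simple waitqueue (swait) API, whose wake side only takes a raw spinlock and wakes a single task, which keeps it usable from the contexts RT cares about. A minimal usage sketch against the 4.9-era swait interface (the queue, flag and function names are made up for the example):

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static bool demo_done;

/* producer: may be called from hard interrupt context */
static void demo_signal(void)
{
        demo_done = true;
        swake_up(&demo_wq);             /* wakes one waiter, raw lock only */
}

/* consumer: sleeps until demo_done is set or a signal arrives */
static int demo_wait(void)
{
        return swait_event_interruptible(demo_wq, demo_done);
}

In later kernels swake_up() became swake_up_one(), but the pattern is the same.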
1592 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1593 index 6c0378c0b8b5..abd58b4dff97 100644
1594 --- a/arch/sh/kernel/irq.c
1595 +++ b/arch/sh/kernel/irq.c
1596 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1597         hardirq_ctx[cpu] = NULL;
1598  }
1599  
1600 +#ifndef CONFIG_PREEMPT_RT_FULL
1601  void do_softirq_own_stack(void)
1602  {
1603         struct thread_info *curctx;
1604 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1605                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1606         );
1607  }
1608 +#endif
1609  #else
1610  static inline void handle_one_irq(unsigned int irq)
1611  {
1612 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1613 index 165ecdd24d22..b68a464a22be 100644
1614 --- a/arch/sparc/Kconfig
1615 +++ b/arch/sparc/Kconfig
1616 @@ -194,12 +194,10 @@ config NR_CPUS
1617  source kernel/Kconfig.hz
1618  
1619  config RWSEM_GENERIC_SPINLOCK
1620 -       bool
1621 -       default y if SPARC32
1622 +       def_bool PREEMPT_RT_FULL
1623  
1624  config RWSEM_XCHGADD_ALGORITHM
1625 -       bool
1626 -       default y if SPARC64
1627 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1628  
1629  config GENERIC_HWEIGHT
1630         bool
1631 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1632 index 34a7930b76ef..773740521008 100644
1633 --- a/arch/sparc/kernel/irq_64.c
1634 +++ b/arch/sparc/kernel/irq_64.c
1635 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1636         set_irq_regs(old_regs);
1637  }
1638  
1639 +#ifndef CONFIG_PREEMPT_RT_FULL
1640  void do_softirq_own_stack(void)
1641  {
1642         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1643 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1644         __asm__ __volatile__("mov %0, %%sp"
1645                              : : "r" (orig_sp));
1646  }
1647 +#endif
1648  
1649  #ifdef CONFIG_HOTPLUG_CPU
1650  void fixup_irqs(void)
1651 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1652 index bada636d1065..f8a995c90c01 100644
1653 --- a/arch/x86/Kconfig
1654 +++ b/arch/x86/Kconfig
1655 @@ -17,6 +17,7 @@ config X86_64
1656  ### Arch settings
1657  config X86
1658         def_bool y
1659 +       select HAVE_PREEMPT_LAZY
1660         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1661         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1662         select ANON_INODES
1663 @@ -232,8 +233,11 @@ config ARCH_MAY_HAVE_PC_FDC
1664         def_bool y
1665         depends on ISA_DMA_API
1666  
1667 +config RWSEM_GENERIC_SPINLOCK
1668 +       def_bool PREEMPT_RT_FULL
1669 +
1670  config RWSEM_XCHGADD_ALGORITHM
1671 -       def_bool y
1672 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1673  
1674  config GENERIC_CALIBRATE_DELAY
1675         def_bool y
1676 @@ -897,7 +901,7 @@ config IOMMU_HELPER
1677  config MAXSMP
1678         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1679         depends on X86_64 && SMP && DEBUG_KERNEL
1680 -       select CPUMASK_OFFSTACK
1681 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1682         ---help---
1683           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1684           If unsure, say N.
1685 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1686 index aa8b0672f87a..2429414bfc71 100644
1687 --- a/arch/x86/crypto/aesni-intel_glue.c
1688 +++ b/arch/x86/crypto/aesni-intel_glue.c
1689 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1690         err = blkcipher_walk_virt(desc, &walk);
1691         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1692  
1693 -       kernel_fpu_begin();
1694         while ((nbytes = walk.nbytes)) {
1695 +               kernel_fpu_begin();
1696                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1697 -                             nbytes & AES_BLOCK_MASK);
1698 +                               nbytes & AES_BLOCK_MASK);
1699 +               kernel_fpu_end();
1700                 nbytes &= AES_BLOCK_SIZE - 1;
1701                 err = blkcipher_walk_done(desc, &walk, nbytes);
1702         }
1703 -       kernel_fpu_end();
1704  
1705         return err;
1706  }
1707 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1708         err = blkcipher_walk_virt(desc, &walk);
1709         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1710  
1711 -       kernel_fpu_begin();
1712         while ((nbytes = walk.nbytes)) {
1713 +               kernel_fpu_begin();
1714                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1715                               nbytes & AES_BLOCK_MASK);
1716 +               kernel_fpu_end();
1717                 nbytes &= AES_BLOCK_SIZE - 1;
1718                 err = blkcipher_walk_done(desc, &walk, nbytes);
1719         }
1720 -       kernel_fpu_end();
1721  
1722         return err;
1723  }
1724 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1725         err = blkcipher_walk_virt(desc, &walk);
1726         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1727  
1728 -       kernel_fpu_begin();
1729         while ((nbytes = walk.nbytes)) {
1730 +               kernel_fpu_begin();
1731                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1732                               nbytes & AES_BLOCK_MASK, walk.iv);
1733 +               kernel_fpu_end();
1734                 nbytes &= AES_BLOCK_SIZE - 1;
1735                 err = blkcipher_walk_done(desc, &walk, nbytes);
1736         }
1737 -       kernel_fpu_end();
1738  
1739         return err;
1740  }
1741 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1742         err = blkcipher_walk_virt(desc, &walk);
1743         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1744  
1745 -       kernel_fpu_begin();
1746         while ((nbytes = walk.nbytes)) {
1747 +               kernel_fpu_begin();
1748                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1749                               nbytes & AES_BLOCK_MASK, walk.iv);
1750 +               kernel_fpu_end();
1751                 nbytes &= AES_BLOCK_SIZE - 1;
1752                 err = blkcipher_walk_done(desc, &walk, nbytes);
1753         }
1754 -       kernel_fpu_end();
1755  
1756         return err;
1757  }
1758 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1759         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1760         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1761  
1762 -       kernel_fpu_begin();
1763         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1764 +               kernel_fpu_begin();
1765                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1766                                       nbytes & AES_BLOCK_MASK, walk.iv);
1767 +               kernel_fpu_end();
1768                 nbytes &= AES_BLOCK_SIZE - 1;
1769                 err = blkcipher_walk_done(desc, &walk, nbytes);
1770         }
1771         if (walk.nbytes) {
1772 +               kernel_fpu_begin();
1773                 ctr_crypt_final(ctx, &walk);
1774 +               kernel_fpu_end();
1775                 err = blkcipher_walk_done(desc, &walk, 0);
1776         }
1777 -       kernel_fpu_end();
1778  
1779         return err;
1780  }
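
All of the crypto hunks in this patch follow one pattern: kernel_fpu_begin()/kernel_fpu_end() keep preemption disabled for their whole duration, so instead of bracketing the entire request they are narrowed to one bounded walk step per loop iteration. A generic sketch of the resulting loop shape (the chunk callback and sizes are illustrative):

#include <asm/fpu/api.h>        /* kernel_fpu_begin()/kernel_fpu_end() */
#include <linux/kernel.h>       /* min() */
#include <linux/types.h>

/* process_chunk() stands in for aesni_ecb_enc() and friends. */
static void fpu_per_chunk(void (*process_chunk)(u8 *dst, const u8 *src,
                                                size_t len),
                          u8 *dst, const u8 *src,
                          size_t total, size_t chunk)
{
        while (total) {
                size_t n = min(total, chunk);

                kernel_fpu_begin();     /* non-preemptible region ...   */
                process_chunk(dst, src, n);
                kernel_fpu_end();       /* ... bounded by one chunk     */

                dst += n;
                src += n;
                total -= n;
        }
}

The cost of saving and restoring the FPU state per chunk is traded for a bounded preempt-off region.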
1781 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1782 index 8648158f3916..d7699130ee36 100644
1783 --- a/arch/x86/crypto/cast5_avx_glue.c
1784 +++ b/arch/x86/crypto/cast5_avx_glue.c
1785 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1786  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1787                      bool enc)
1788  {
1789 -       bool fpu_enabled = false;
1790 +       bool fpu_enabled;
1791         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1792         const unsigned int bsize = CAST5_BLOCK_SIZE;
1793         unsigned int nbytes;
1794 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1795                 u8 *wsrc = walk->src.virt.addr;
1796                 u8 *wdst = walk->dst.virt.addr;
1797  
1798 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1799 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1800  
1801                 /* Process multi-block batch */
1802                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1803 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1804                 } while (nbytes >= bsize);
1805  
1806  done:
1807 +               cast5_fpu_end(fpu_enabled);
1808                 err = blkcipher_walk_done(desc, walk, nbytes);
1809         }
1810 -
1811 -       cast5_fpu_end(fpu_enabled);
1812         return err;
1813  }
1814  
1815 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1816  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1817                        struct scatterlist *src, unsigned int nbytes)
1818  {
1819 -       bool fpu_enabled = false;
1820 +       bool fpu_enabled;
1821         struct blkcipher_walk walk;
1822         int err;
1823  
1824 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1825         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1826  
1827         while ((nbytes = walk.nbytes)) {
1828 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1829 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1830                 nbytes = __cbc_decrypt(desc, &walk);
1831 +               cast5_fpu_end(fpu_enabled);
1832                 err = blkcipher_walk_done(desc, &walk, nbytes);
1833         }
1834 -
1835 -       cast5_fpu_end(fpu_enabled);
1836         return err;
1837  }
1838  
1839 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1840  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1841                      struct scatterlist *src, unsigned int nbytes)
1842  {
1843 -       bool fpu_enabled = false;
1844 +       bool fpu_enabled;
1845         struct blkcipher_walk walk;
1846         int err;
1847  
1848 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1849         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1850  
1851         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1852 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1853 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1854                 nbytes = __ctr_crypt(desc, &walk);
1855 +               cast5_fpu_end(fpu_enabled);
1856                 err = blkcipher_walk_done(desc, &walk, nbytes);
1857         }
1858  
1859 -       cast5_fpu_end(fpu_enabled);
1860 -
1861         if (walk.nbytes) {
1862                 ctr_crypt_final(desc, &walk);
1863                 err = blkcipher_walk_done(desc, &walk, 0);
1864 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1865 index 6a85598931b5..3a506ce7ed93 100644
1866 --- a/arch/x86/crypto/glue_helper.c
1867 +++ b/arch/x86/crypto/glue_helper.c
1868 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1869         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1870         const unsigned int bsize = 128 / 8;
1871         unsigned int nbytes, i, func_bytes;
1872 -       bool fpu_enabled = false;
1873 +       bool fpu_enabled;
1874         int err;
1875  
1876         err = blkcipher_walk_virt(desc, walk);
1877 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1878                 u8 *wdst = walk->dst.virt.addr;
1879  
1880                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1881 -                                            desc, fpu_enabled, nbytes);
1882 +                                            desc, false, nbytes);
1883  
1884                 for (i = 0; i < gctx->num_funcs; i++) {
1885                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1886 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1887                 }
1888  
1889  done:
1890 +               glue_fpu_end(fpu_enabled);
1891                 err = blkcipher_walk_done(desc, walk, nbytes);
1892         }
1893  
1894 -       glue_fpu_end(fpu_enabled);
1895         return err;
1896  }
1897  
1898 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1899                             struct scatterlist *src, unsigned int nbytes)
1900  {
1901         const unsigned int bsize = 128 / 8;
1902 -       bool fpu_enabled = false;
1903 +       bool fpu_enabled;
1904         struct blkcipher_walk walk;
1905         int err;
1906  
1907 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1908  
1909         while ((nbytes = walk.nbytes)) {
1910                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1911 -                                            desc, fpu_enabled, nbytes);
1912 +                                            desc, false, nbytes);
1913                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1914 +               glue_fpu_end(fpu_enabled);
1915                 err = blkcipher_walk_done(desc, &walk, nbytes);
1916         }
1917  
1918 -       glue_fpu_end(fpu_enabled);
1919         return err;
1920  }
1921  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1922 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1923                           struct scatterlist *src, unsigned int nbytes)
1924  {
1925         const unsigned int bsize = 128 / 8;
1926 -       bool fpu_enabled = false;
1927 +       bool fpu_enabled;
1928         struct blkcipher_walk walk;
1929         int err;
1930  
1931 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1932  
1933         while ((nbytes = walk.nbytes) >= bsize) {
1934                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1935 -                                            desc, fpu_enabled, nbytes);
1936 +                                            desc, false, nbytes);
1937                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1938 +               glue_fpu_end(fpu_enabled);
1939                 err = blkcipher_walk_done(desc, &walk, nbytes);
1940         }
1941  
1942 -       glue_fpu_end(fpu_enabled);
1943 -
1944         if (walk.nbytes) {
1945                 glue_ctr_crypt_final_128bit(
1946                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1947 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1948                           void *tweak_ctx, void *crypt_ctx)
1949  {
1950         const unsigned int bsize = 128 / 8;
1951 -       bool fpu_enabled = false;
1952 +       bool fpu_enabled;
1953         struct blkcipher_walk walk;
1954         int err;
1955  
1956 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1957  
1958         /* set minimum length to bsize, for tweak_fn */
1959         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1960 -                                    desc, fpu_enabled,
1961 +                                    desc, false,
1962                                      nbytes < bsize ? bsize : nbytes);
1963 -
1964         /* calculate first value of T */
1965         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1966 +       glue_fpu_end(fpu_enabled);
1967  
1968         while (nbytes) {
1969 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1970 +                               desc, false, nbytes);
1971                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1972  
1973 +               glue_fpu_end(fpu_enabled);
1974                 err = blkcipher_walk_done(desc, &walk, nbytes);
1975                 nbytes = walk.nbytes;
1976         }
1977 -
1978 -       glue_fpu_end(fpu_enabled);
1979 -
1980         return err;
1981  }
1982  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1983 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1984 index bdd9cc59d20f..56d01a339ba4 100644
1985 --- a/arch/x86/entry/common.c
1986 +++ b/arch/x86/entry/common.c
1987 @@ -129,7 +129,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1988  
1989  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1990         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1991 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1992 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1993  
1994  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1995  {
1996 @@ -145,9 +145,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1997                 /* We have work to do. */
1998                 local_irq_enable();
1999  
2000 -               if (cached_flags & _TIF_NEED_RESCHED)
2001 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2002                         schedule();
2003  
2004 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2005 +               if (unlikely(current->forced_info.si_signo)) {
2006 +                       struct task_struct *t = current;
2007 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2008 +                       t->forced_info.si_signo = 0;
2009 +               }
2010 +#endif
2011                 if (cached_flags & _TIF_UPROBE)
2012                         uprobe_notify_resume(regs);
2013  
2014 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2015 index edba8606b99a..4a3389535fc6 100644
2016 --- a/arch/x86/entry/entry_32.S
2017 +++ b/arch/x86/entry/entry_32.S
2018 @@ -308,8 +308,25 @@ END(ret_from_exception)
2019  ENTRY(resume_kernel)
2020         DISABLE_INTERRUPTS(CLBR_ANY)
2021  need_resched:
2022 +       # preempt count == 0 + NEED_RS set?
2023         cmpl    $0, PER_CPU_VAR(__preempt_count)
2024 +#ifndef CONFIG_PREEMPT_LAZY
2025         jnz     restore_all
2026 +#else
2027 +       jz test_int_off
2028 +
2029 +       # at least preempt count == 0 ?
2030 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2031 +       jne restore_all
2032 +
2033 +       movl    PER_CPU_VAR(current_task), %ebp
2034 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
2035 +       jnz restore_all
2036 +
2037 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2038 +       jz restore_all
2039 +test_int_off:
2040 +#endif
2041         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2042         jz      restore_all
2043         call    preempt_schedule_irq
2044 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2045 index ef766a358b37..28401f826ab1 100644
2046 --- a/arch/x86/entry/entry_64.S
2047 +++ b/arch/x86/entry/entry_64.S
2048 @@ -546,7 +546,23 @@ GLOBAL(retint_user)
2049         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2050         jnc     1f
2051  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2052 +#ifndef CONFIG_PREEMPT_LAZY
2053         jnz     1f
2054 +#else
2055 +       jz      do_preempt_schedule_irq
2056 +
2057 +       # at least preempt count == 0 ?
2058 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2059 +       jnz     1f
2060 +
2061 +       movq    PER_CPU_VAR(current_task), %rcx
2062 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2063 +       jnz     1f
2064 +
2065 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2066 +       jnc     1f
2067 +do_preempt_schedule_irq:
2068 +#endif
2069         call    preempt_schedule_irq
2070         jmp     0b
2071  1:
2072 @@ -894,6 +910,7 @@ EXPORT_SYMBOL(native_load_gs_index)
2073         jmp     2b
2074         .previous
2075  
2076 +#ifndef CONFIG_PREEMPT_RT_FULL
2077  /* Call softirq on interrupt stack. Interrupts are off. */
2078  ENTRY(do_softirq_own_stack)
2079         pushq   %rbp
2080 @@ -906,6 +923,7 @@ ENTRY(do_softirq_own_stack)
2081         decl    PER_CPU_VAR(irq_count)
2082         ret
2083  END(do_softirq_own_stack)
2084 +#endif
2085  
2086  #ifdef CONFIG_XEN
2087  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
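
The assembly added to resume_kernel (32-bit) and retint_user (64-bit) encodes the lazy-preemption decision. Expressed in C it corresponds roughly to the sketch below, which mirrors the __preempt_count_dec_and_test()/should_resched() changes in asm/preempt.h that follow (the function name is made up):

#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/thread_info.h>

/* Preempt kernel code on IRQ return if the normal preempt_count is
 * zero and either NEED_RESCHED is set, or only the lazy flag is set
 * and no preempt_lazy_count is held. */
static bool irq_exit_should_preempt(void)
{
        if (preempt_count() != 0)
                return false;
        if (test_thread_flag(TIF_NEED_RESCHED))
                return true;
#ifdef CONFIG_PREEMPT_LAZY
        if (current_thread_info()->preempt_lazy_count != 0)
                return false;
        return test_thread_flag(TIF_NEED_RESCHED_LAZY);
#else
        return false;
#endif
}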
2088 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2089 index 17f218645701..11bd1b7ee6eb 100644
2090 --- a/arch/x86/include/asm/preempt.h
2091 +++ b/arch/x86/include/asm/preempt.h
2092 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2093   * a decrement which hits zero means we have no preempt_count and should
2094   * reschedule.
2095   */
2096 -static __always_inline bool __preempt_count_dec_and_test(void)
2097 +static __always_inline bool ____preempt_count_dec_and_test(void)
2098  {
2099         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2100  }
2101  
2102 +static __always_inline bool __preempt_count_dec_and_test(void)
2103 +{
2104 +       if (____preempt_count_dec_and_test())
2105 +               return true;
2106 +#ifdef CONFIG_PREEMPT_LAZY
2107 +       if (current_thread_info()->preempt_lazy_count)
2108 +               return false;
2109 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2110 +#else
2111 +       return false;
2112 +#endif
2113 +}
2114 +
2115  /*
2116   * Returns true when we need to resched and can (barring IRQ state).
2117   */
2118  static __always_inline bool should_resched(int preempt_offset)
2119  {
2120 +#ifdef CONFIG_PREEMPT_LAZY
2121 +       u32 tmp;
2122 +
2123 +       tmp = raw_cpu_read_4(__preempt_count);
2124 +       if (tmp == preempt_offset)
2125 +               return true;
2126 +
2127 +       /* preempt count == 0 ? */
2128 +       tmp &= ~PREEMPT_NEED_RESCHED;
2129 +       if (tmp)
2130 +               return false;
2131 +       if (current_thread_info()->preempt_lazy_count)
2132 +               return false;
2133 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2134 +#else
2135         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2136 +#endif
2137  }
2138  
2139  #ifdef CONFIG_PREEMPT
2140 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2141 index 8af22be0fe61..d1328789b759 100644
2142 --- a/arch/x86/include/asm/signal.h
2143 +++ b/arch/x86/include/asm/signal.h
2144 @@ -27,6 +27,19 @@ typedef struct {
2145  #define SA_IA32_ABI    0x02000000u
2146  #define SA_X32_ABI     0x01000000u
2147  
2148 +/*
2149 + * Because some traps use the IST stack, we must keep preemption
2150 + * disabled while calling do_trap(), but do_trap() may call
2151 + * force_sig_info() which will grab the signal spin_locks for the
2152 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2153 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2154 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2155 + * trap.
2156 + */
2157 +#if defined(CONFIG_PREEMPT_RT_FULL)
2158 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2159 +#endif
2160 +
2161  #ifndef CONFIG_COMPAT
2162  typedef sigset_t compat_sigset_t;
2163  #endif
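
With ARCH_RT_DELAYS_SIGNAL_SEND defined, force_sig_info() avoids taking the (now sleeping) sighand lock from the trap path: it parks the siginfo in the task and lets the exit-to-usermode loop deliver it, as in the arch/x86/entry/common.c hunk earlier. A sketch of the sending side under this scheme (field names follow that hunk; the real plumbing lives in kernel/signal.c and the helper name here is made up):

#include <linux/preempt.h>
#include <linux/sched.h>

static int delay_forced_signal(int sig, struct siginfo *info,
                               struct task_struct *t)
{
        if (!in_atomic())
                return -EAGAIN;         /* take the normal path instead */

        /* Stash the signal; exit_to_usermode_loop() re-sends it with
         * interrupts enabled and sleeping locks allowed. */
        t->forced_info = *info;
        t->forced_info.si_signo = sig;
        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
        return 0;
}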
2164 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2165 index 58505f01962f..02fa39652cd6 100644
2166 --- a/arch/x86/include/asm/stackprotector.h
2167 +++ b/arch/x86/include/asm/stackprotector.h
2168 @@ -59,7 +59,7 @@
2169   */
2170  static __always_inline void boot_init_stack_canary(void)
2171  {
2172 -       u64 canary;
2173 +       u64 uninitialized_var(canary);
2174         u64 tsc;
2175  
2176  #ifdef CONFIG_X86_64
2177 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2178          * of randomness. The TSC only matters for very early init,
2179          * there it already has some randomness on most systems. Later
2180          * on during the bootup the random pool has true entropy too.
2181 +        *
2182 +        * For preempt-rt we need to weaken the randomness a bit, as
2183 +        * we can't call into the random generator from atomic context
2184 +        * due to locking constraints. We just leave canary
2185 +        * uninitialized and use the TSC based randomness on top of it.
2186          */
2187 +#ifndef CONFIG_PREEMPT_RT_FULL
2188         get_random_bytes(&canary, sizeof(canary));
2189 +#endif
2190         tsc = rdtsc();
2191         canary += tsc + (tsc << 32UL);
2192  
2193 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2194 index ad6f5eb07a95..5ceb3a1c2b1a 100644
2195 --- a/arch/x86/include/asm/thread_info.h
2196 +++ b/arch/x86/include/asm/thread_info.h
2197 @@ -54,11 +54,14 @@ struct task_struct;
2198  
2199  struct thread_info {
2200         unsigned long           flags;          /* low level flags */
2201 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2202 +                                                          <0 => BUG */
2203  };
2204  
2205  #define INIT_THREAD_INFO(tsk)                  \
2206  {                                              \
2207         .flags          = 0,                    \
2208 +       .preempt_lazy_count = 0,                \
2209  }
2210  
2211  #define init_stack             (init_thread_union.stack)
2212 @@ -67,6 +70,10 @@ struct thread_info {
2213  
2214  #include <asm/asm-offsets.h>
2215  
2216 +#define GET_THREAD_INFO(reg) \
2217 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2218 +       _ASM_SUB $(THREAD_SIZE),reg ;
2219 +
2220  #endif
2221  
2222  /*
2223 @@ -85,6 +92,7 @@ struct thread_info {
2224  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2225  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2226  #define TIF_SECCOMP            8       /* secure computing */
2227 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2228  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2229  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2230  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2231 @@ -108,6 +116,7 @@ struct thread_info {
2232  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2233  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2234  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2235 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2236  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2237  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2238  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2239 @@ -143,6 +152,8 @@ struct thread_info {
2240  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2241  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2242  
2243 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2244 +
2245  #define STACK_WARN             (THREAD_SIZE/8)
2246  
2247  /*
2248 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2249 index 57ab86d94d64..35d25e27180f 100644
2250 --- a/arch/x86/include/asm/uv/uv_bau.h
2251 +++ b/arch/x86/include/asm/uv/uv_bau.h
2252 @@ -624,9 +624,9 @@ struct bau_control {
2253         cycles_t                send_message;
2254         cycles_t                period_end;
2255         cycles_t                period_time;
2256 -       spinlock_t              uvhub_lock;
2257 -       spinlock_t              queue_lock;
2258 -       spinlock_t              disable_lock;
2259 +       raw_spinlock_t          uvhub_lock;
2260 +       raw_spinlock_t          queue_lock;
2261 +       raw_spinlock_t          disable_lock;
2262         /* tunables */
2263         int                     max_concurr;
2264         int                     max_concurr_const;
2265 @@ -815,15 +815,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2266   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2267   * on equal.
2268   */
2269 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2270 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2271  {
2272 -       spin_lock(lock);
2273 +       raw_spin_lock(lock);
2274         if (atomic_read(v) >= u) {
2275 -               spin_unlock(lock);
2276 +               raw_spin_unlock(lock);
2277                 return 0;
2278         }
2279         atomic_inc(v);
2280 -       spin_unlock(lock);
2281 +       raw_spin_unlock(lock);
2282         return 1;
2283  }
2284  
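
These BAU locks are taken from the TLB-shootdown/IPI path with interrupts disabled, where a sleeping lock is not allowed on RT; hence the conversion from spinlock_t (an rt_mutex under PREEMPT_RT_FULL) to raw_spinlock_t. The pattern in isolation (lock and counter names are illustrative):

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);
static int demo_active;

/* Safe to call with interrupts disabled, even on PREEMPT_RT_FULL,
 * because raw_spin_lock() never sleeps. */
static int demo_inc_unless_ge(int limit)
{
        int ret = 0;

        raw_spin_lock(&demo_lock);
        if (demo_active < limit) {
                demo_active++;
                ret = 1;
        }
        raw_spin_unlock(&demo_lock);
        return ret;
}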
2285 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2286 index 931ced8ca345..167975ac8af7 100644
2287 --- a/arch/x86/kernel/acpi/boot.c
2288 +++ b/arch/x86/kernel/acpi/boot.c
2289 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2290   *             ->ioapic_mutex
2291   *                     ->ioapic_lock
2292   */
2293 +#ifdef CONFIG_X86_IO_APIC
2294  static DEFINE_MUTEX(acpi_ioapic_lock);
2295 +#endif
2296  
2297  /* --------------------------------------------------------------------------
2298                                Boot-time Configuration
2299 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2300 index d1e25564b3c1..67e585fa801f 100644
2301 --- a/arch/x86/kernel/apic/io_apic.c
2302 +++ b/arch/x86/kernel/apic/io_apic.c
2303 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2304  static inline bool ioapic_irqd_mask(struct irq_data *data)
2305  {
2306         /* If we are moving the irq we need to mask it */
2307 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2308 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2309 +                    !irqd_irq_inprogress(data))) {
2310                 mask_ioapic_irq(data);
2311                 return true;
2312         }
2313 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2314 index c62e015b126c..0cc71257fca6 100644
2315 --- a/arch/x86/kernel/asm-offsets.c
2316 +++ b/arch/x86/kernel/asm-offsets.c
2317 @@ -36,6 +36,7 @@ void common(void) {
2318  
2319         BLANK();
2320         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2321 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2322         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2323  
2324         BLANK();
2325 @@ -91,4 +92,5 @@ void common(void) {
2326  
2327         BLANK();
2328         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2329 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2330  }
2331 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2332 index 22cda29d654e..57c85e3af092 100644
2333 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2334 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2335 @@ -41,6 +41,8 @@
2336  #include <linux/debugfs.h>
2337  #include <linux/irq_work.h>
2338  #include <linux/export.h>
2339 +#include <linux/jiffies.h>
2340 +#include <linux/swork.h>
2341  #include <linux/jump_label.h>
2342  
2343  #include <asm/processor.h>
2344 @@ -1307,7 +1309,7 @@ void mce_log_therm_throt_event(__u64 status)
2345  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2346  
2347  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2348 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2349 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2350  
2351  static unsigned long mce_adjust_timer_default(unsigned long interval)
2352  {
2353 @@ -1316,32 +1318,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2354  
2355  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2356  
2357 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2358 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2359  {
2360 -       unsigned long when = jiffies + interval;
2361 -       unsigned long flags;
2362 -
2363 -       local_irq_save(flags);
2364 -
2365 -       if (timer_pending(t)) {
2366 -               if (time_before(when, t->expires))
2367 -                       mod_timer(t, when);
2368 -       } else {
2369 -               t->expires = round_jiffies(when);
2370 -               add_timer_on(t, smp_processor_id());
2371 -       }
2372 -
2373 -       local_irq_restore(flags);
2374 +       if (!interval)
2375 +               return HRTIMER_NORESTART;
2376 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2377 +       return HRTIMER_RESTART;
2378  }
2379  
2380 -static void mce_timer_fn(unsigned long data)
2381 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2382  {
2383 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2384 -       int cpu = smp_processor_id();
2385         unsigned long iv;
2386  
2387 -       WARN_ON(cpu != data);
2388 -
2389         iv = __this_cpu_read(mce_next_interval);
2390  
2391         if (mce_available(this_cpu_ptr(&cpu_info))) {
2392 @@ -1364,7 +1352,7 @@ static void mce_timer_fn(unsigned long data)
2393  
2394  done:
2395         __this_cpu_write(mce_next_interval, iv);
2396 -       __restart_timer(t, iv);
2397 +       return __restart_timer(timer, iv);
2398  }
2399  
2400  /*
2401 @@ -1372,7 +1360,7 @@ static void mce_timer_fn(unsigned long data)
2402   */
2403  void mce_timer_kick(unsigned long interval)
2404  {
2405 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2406 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2407         unsigned long iv = __this_cpu_read(mce_next_interval);
2408  
2409         __restart_timer(t, interval);
2410 @@ -1387,7 +1375,7 @@ static void mce_timer_delete_all(void)
2411         int cpu;
2412  
2413         for_each_online_cpu(cpu)
2414 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2415 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2416  }
2417  
2418  static void mce_do_trigger(struct work_struct *work)
2419 @@ -1397,6 +1385,56 @@ static void mce_do_trigger(struct work_struct *work)
2420  
2421  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2422  
2423 +static void __mce_notify_work(struct swork_event *event)
2424 +{
2425 +       /* Not more than two messages every minute */
2426 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2427 +
2428 +       /* wake processes polling /dev/mcelog */
2429 +       wake_up_interruptible(&mce_chrdev_wait);
2430 +
2431 +       /*
2432 +        * There is no risk of missing notifications because
2433 +        * work_pending is always cleared before the function is
2434 +        * executed.
2435 +        */
2436 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2437 +               schedule_work(&mce_trigger_work);
2438 +
2439 +       if (__ratelimit(&ratelimit))
2440 +               pr_info(HW_ERR "Machine check events logged\n");
2441 +}
2442 +
2443 +#ifdef CONFIG_PREEMPT_RT_FULL
2444 +static bool notify_work_ready __read_mostly;
2445 +static struct swork_event notify_work;
2446 +
2447 +static int mce_notify_work_init(void)
2448 +{
2449 +       int err;
2450 +
2451 +       err = swork_get();
2452 +       if (err)
2453 +               return err;
2454 +
2455 +       INIT_SWORK(&notify_work, __mce_notify_work);
2456 +       notify_work_ready = true;
2457 +       return 0;
2458 +}
2459 +
2460 +static void mce_notify_work(void)
2461 +{
2462 +       if (notify_work_ready)
2463 +               swork_queue(&notify_work);
2464 +}
2465 +#else
2466 +static void mce_notify_work(void)
2467 +{
2468 +       __mce_notify_work(NULL);
2469 +}
2470 +static inline int mce_notify_work_init(void) { return 0; }
2471 +#endif
2472 +
2473  /*
2474   * Notify the user(s) about new machine check events.
2475   * Can be called from interrupt context, but not from machine check/NMI
2476 @@ -1404,19 +1442,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2477   */
2478  int mce_notify_irq(void)
2479  {
2480 -       /* Not more than two messages every minute */
2481 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2482 -
2483         if (test_and_clear_bit(0, &mce_need_notify)) {
2484 -               /* wake processes polling /dev/mcelog */
2485 -               wake_up_interruptible(&mce_chrdev_wait);
2486 -
2487 -               if (mce_helper[0])
2488 -                       schedule_work(&mce_trigger_work);
2489 -
2490 -               if (__ratelimit(&ratelimit))
2491 -                       pr_info(HW_ERR "Machine check events logged\n");
2492 -
2493 +               mce_notify_work();
2494                 return 1;
2495         }
2496         return 0;
2497 @@ -1722,7 +1749,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2498         }
2499  }
2500  
2501 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2502 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2503  {
2504         unsigned long iv = check_interval * HZ;
2505  
2506 @@ -1731,16 +1758,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2507  
2508         per_cpu(mce_next_interval, cpu) = iv;
2509  
2510 -       t->expires = round_jiffies(jiffies + iv);
2511 -       add_timer_on(t, cpu);
2512 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2513 +                       0, HRTIMER_MODE_REL_PINNED);
2514  }
2515  
2516  static void __mcheck_cpu_init_timer(void)
2517  {
2518 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2519 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2520         unsigned int cpu = smp_processor_id();
2521  
2522 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2523 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2524 +       t->function = mce_timer_fn;
2525         mce_start_timer(cpu, t);
2526  }
2527  
2528 @@ -2465,6 +2493,8 @@ static void mce_disable_cpu(void *h)
2529         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2530                 return;
2531  
2532 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2533 +
2534         if (!(action & CPU_TASKS_FROZEN))
2535                 cmci_clear();
2536  
2537 @@ -2487,6 +2517,7 @@ static void mce_reenable_cpu(void *h)
2538                 if (b->init)
2539                         wrmsrl(msr_ops.ctl(i), b->ctl);
2540         }
2541 +       __mcheck_cpu_init_timer();
2542  }
2543  
2544  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2545 @@ -2494,7 +2525,6 @@ static int
2546  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2547  {
2548         unsigned int cpu = (unsigned long)hcpu;
2549 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2550  
2551         switch (action & ~CPU_TASKS_FROZEN) {
2552         case CPU_ONLINE:
2553 @@ -2514,11 +2544,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2554                 break;
2555         case CPU_DOWN_PREPARE:
2556                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2557 -               del_timer_sync(t);
2558                 break;
2559         case CPU_DOWN_FAILED:
2560                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2561 -               mce_start_timer(cpu, t);
2562                 break;
2563         }
2564  
2565 @@ -2557,6 +2585,10 @@ static __init int mcheck_init_device(void)
2566                 goto err_out;
2567         }
2568  
2569 +       err = mce_notify_work_init();
2570 +       if (err)
2571 +               goto err_out;
2572 +
2573         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2574                 err = -ENOMEM;
2575                 goto err_out;
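
Two RT conversions happen in mce.c: the polling timer moves from a timer_list (whose callback would run in softirq context) to a self-rearming hrtimer, and the user-space notification goes through a swork item so it runs from schedulable context. The self-rearming hrtimer pattern in isolation (period and names are illustrative):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
        /* do the periodic, non-sleeping work here ... */

        hrtimer_forward_now(t, ms_to_ktime(1000));      /* re-arm: +1s */
        return HRTIMER_RESTART;
}

static void demo_timer_start(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_timer_fn;
        hrtimer_start(&demo_timer, ms_to_ktime(1000), HRTIMER_MODE_REL);
}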
2576 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2577 index 1f38d9a4d9de..053bf3b2ef39 100644
2578 --- a/arch/x86/kernel/irq_32.c
2579 +++ b/arch/x86/kernel/irq_32.c
2580 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2581                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2582  }
2583  
2584 +#ifndef CONFIG_PREEMPT_RT_FULL
2585  void do_softirq_own_stack(void)
2586  {
2587         struct irq_stack *irqstk;
2588 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2589  
2590         call_on_stack(__do_softirq, isp);
2591  }
2592 +#endif
2593  
2594  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2595  {
2596 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2597 index bd7be8efdc4c..b3b0a7f7b1ca 100644
2598 --- a/arch/x86/kernel/process_32.c
2599 +++ b/arch/x86/kernel/process_32.c
2600 @@ -35,6 +35,7 @@
2601  #include <linux/uaccess.h>
2602  #include <linux/io.h>
2603  #include <linux/kdebug.h>
2604 +#include <linux/highmem.h>
2605  
2606  #include <asm/pgtable.h>
2607  #include <asm/ldt.h>
2608 @@ -195,6 +196,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2609  }
2610  EXPORT_SYMBOL_GPL(start_thread);
2611  
2612 +#ifdef CONFIG_PREEMPT_RT_FULL
2613 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2614 +{
2615 +       int i;
2616 +
2617 +       /*
2618 +        * Clear @prev's kmap_atomic mappings
2619 +        */
2620 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2621 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2622 +               pte_t *ptep = kmap_pte - idx;
2623 +
2624 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2625 +       }
2626 +       /*
2627 +        * Restore @next_p's kmap_atomic mappings
2628 +        */
2629 +       for (i = 0; i < next_p->kmap_idx; i++) {
2630 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2631 +
2632 +               if (!pte_none(next_p->kmap_pte[i]))
2633 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2634 +       }
2635 +}
2636 +#else
2637 +static inline void
2638 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2639 +#endif
2640 +
2641  
2642  /*
2643   *     switch_to(x,y) should switch tasks from x to y.
2644 @@ -271,6 +301,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2645                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2646                 __switch_to_xtra(prev_p, next_p, tss);
2647  
2648 +       switch_kmaps(prev_p, next_p);
2649 +
2650         /*
2651          * Leave lazy mode, flushing any hypercalls made here.
2652          * This must be done before restoring TLS segments so
2653 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2654 index 3f05c044720b..fe68afd37162 100644
2655 --- a/arch/x86/kvm/lapic.c
2656 +++ b/arch/x86/kvm/lapic.c
2657 @@ -1939,6 +1939,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2658         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2659                      HRTIMER_MODE_ABS_PINNED);
2660         apic->lapic_timer.timer.function = apic_timer_fn;
2661 +       apic->lapic_timer.timer.irqsafe = 1;
2662  
2663         /*
2664          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2665 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2666 index e5bc139d1ba7..fa0aa5931a4b 100644
2667 --- a/arch/x86/kvm/x86.c
2668 +++ b/arch/x86/kvm/x86.c
2669 @@ -5933,6 +5933,13 @@ int kvm_arch_init(void *opaque)
2670                 goto out;
2671         }
2672  
2673 +#ifdef CONFIG_PREEMPT_RT_FULL
2674 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2675 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2676 +               return -EOPNOTSUPP;
2677 +       }
2678 +#endif
2679 +
2680         r = kvm_mmu_module_init();
2681         if (r)
2682                 goto out_free_percpu;
2683 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2684 index 6d18b70ed5a9..f752724c22e8 100644
2685 --- a/arch/x86/mm/highmem_32.c
2686 +++ b/arch/x86/mm/highmem_32.c
2687 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2688   */
2689  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2690  {
2691 +       pte_t pte = mk_pte(page, prot);
2692         unsigned long vaddr;
2693         int idx, type;
2694  
2695 -       preempt_disable();
2696 +       preempt_disable_nort();
2697         pagefault_disable();
2698  
2699         if (!PageHighMem(page))
2700 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2701         idx = type + KM_TYPE_NR*smp_processor_id();
2702         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2703         BUG_ON(!pte_none(*(kmap_pte-idx)));
2704 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2705 +#ifdef CONFIG_PREEMPT_RT_FULL
2706 +       current->kmap_pte[type] = pte;
2707 +#endif
2708 +       set_pte(kmap_pte-idx, pte);
2709         arch_flush_lazy_mmu_mode();
2710  
2711         return (void *)vaddr;
2712 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2713                  * is a bad idea also, in case the page changes cacheability
2714                  * attributes or becomes a protected page in a hypervisor.
2715                  */
2716 +#ifdef CONFIG_PREEMPT_RT_FULL
2717 +               current->kmap_pte[type] = __pte(0);
2718 +#endif
2719                 kpte_clear_flush(kmap_pte-idx, vaddr);
2720                 kmap_atomic_idx_pop();
2721                 arch_flush_lazy_mmu_mode();
2722 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2723  #endif
2724  
2725         pagefault_enable();
2726 -       preempt_enable();
2727 +       preempt_enable_nort();
2728  }
2729  EXPORT_SYMBOL(__kunmap_atomic);
2730  
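
kmap_atomic() users rely on the mapping staying valid until kunmap_atomic(); since RT no longer disables preemption here (preempt_disable_nort()), each mapping is also recorded in current->kmap_pte[] so switch_kmaps() in process_32.c can re-install the fixmap slots after a context switch. Callers are unaffected; typical usage stays:

#include <linux/highmem.h>
#include <linux/string.h>

/* Copy one page of data out of a (possibly highmem) page. */
static void copy_from_page(struct page *page, void *dst)
{
        void *src = kmap_atomic(page);

        memcpy(dst, src, PAGE_SIZE);
        kunmap_atomic(src);
}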
2731 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2732 index ada98b39b8ad..585f6829653b 100644
2733 --- a/arch/x86/mm/iomap_32.c
2734 +++ b/arch/x86/mm/iomap_32.c
2735 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2736  
2737  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2738  {
2739 +       pte_t pte = pfn_pte(pfn, prot);
2740         unsigned long vaddr;
2741         int idx, type;
2742  
2743 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2744         type = kmap_atomic_idx_push();
2745         idx = type + KM_TYPE_NR * smp_processor_id();
2746         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2747 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2748 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2749 +
2750 +#ifdef CONFIG_PREEMPT_RT_FULL
2751 +       current->kmap_pte[type] = pte;
2752 +#endif
2753 +       set_pte(kmap_pte - idx, pte);
2754         arch_flush_lazy_mmu_mode();
2755  
2756         return (void *)vaddr;
2757 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2758                  * is a bad idea also, in case the page changes cacheability
2759                  * attributes or becomes a protected page in a hypervisor.
2760                  */
2761 +#ifdef CONFIG_PREEMPT_RT_FULL
2762 +               current->kmap_pte[type] = __pte(0);
2763 +#endif
2764                 kpte_clear_flush(kmap_pte-idx, vaddr);
2765                 kmap_atomic_idx_pop();
2766         }
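Because the section is now preemptible on RT, a task can be scheduled out or migrated while it still holds an atomic kmap. That is why kmap_atomic_prot() and kmap_atomic_prot_pfn() record the PTE they install in current->kmap_pte[type] and clear the slot again on unmap: the scheduler side of the RT patch (not shown in this section) replays those entries when the task runs again. A rough sketch of that restore step, with the kmap_idx field assumed from the rest of the RT patch:

	/* Sketch only -- the real hook lives elsewhere in the RT patch. */
	static void switch_kmaps_sketch(struct task_struct *next)
	{
		int i;

		for (i = 0; i < next->kmap_idx; i++) {
			int idx = i + KM_TYPE_NR * smp_processor_id();

			if (!pte_none(next->kmap_pte[i]))
				set_pte(kmap_pte - idx, next->kmap_pte[i]);
		}
		arch_flush_lazy_mmu_mode();
	}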
2767 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2768 index e3353c97d086..01664968555c 100644
2769 --- a/arch/x86/mm/pageattr.c
2770 +++ b/arch/x86/mm/pageattr.c
2771 @@ -214,7 +214,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
2772                             int in_flags, struct page **pages)
2773  {
2774         unsigned int i, level;
2775 +#ifdef CONFIG_PREEMPT
2776 +       /*
2777 +        * Avoid wbinvd() because it causes latencies on all CPUs,
2778 +        * regardless of any CPU isolation that may be in effect.
2779 +        */
2780 +       unsigned long do_wbinvd = 0;
2781 +#else
2782         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
2783 +#endif
2784  
2785         BUG_ON(irqs_disabled());
2786  
2787 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2788 index 9e42842e924a..5398f97172f9 100644
2789 --- a/arch/x86/platform/uv/tlb_uv.c
2790 +++ b/arch/x86/platform/uv/tlb_uv.c
2791 @@ -748,9 +748,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2792  
2793                 quiesce_local_uvhub(hmaster);
2794  
2795 -               spin_lock(&hmaster->queue_lock);
2796 +               raw_spin_lock(&hmaster->queue_lock);
2797                 reset_with_ipi(&bau_desc->distribution, bcp);
2798 -               spin_unlock(&hmaster->queue_lock);
2799 +               raw_spin_unlock(&hmaster->queue_lock);
2800  
2801                 end_uvhub_quiesce(hmaster);
2802  
2803 @@ -770,9 +770,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2804  
2805                 quiesce_local_uvhub(hmaster);
2806  
2807 -               spin_lock(&hmaster->queue_lock);
2808 +               raw_spin_lock(&hmaster->queue_lock);
2809                 reset_with_ipi(&bau_desc->distribution, bcp);
2810 -               spin_unlock(&hmaster->queue_lock);
2811 +               raw_spin_unlock(&hmaster->queue_lock);
2812  
2813                 end_uvhub_quiesce(hmaster);
2814  
2815 @@ -793,7 +793,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2816         cycles_t tm1;
2817  
2818         hmaster = bcp->uvhub_master;
2819 -       spin_lock(&hmaster->disable_lock);
2820 +       raw_spin_lock(&hmaster->disable_lock);
2821         if (!bcp->baudisabled) {
2822                 stat->s_bau_disabled++;
2823                 tm1 = get_cycles();
2824 @@ -806,7 +806,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2825                         }
2826                 }
2827         }
2828 -       spin_unlock(&hmaster->disable_lock);
2829 +       raw_spin_unlock(&hmaster->disable_lock);
2830  }
2831  
2832  static void count_max_concurr(int stat, struct bau_control *bcp,
2833 @@ -869,7 +869,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2834   */
2835  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2836  {
2837 -       spinlock_t *lock = &hmaster->uvhub_lock;
2838 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2839         atomic_t *v;
2840  
2841         v = &hmaster->active_descriptor_count;
2842 @@ -1002,7 +1002,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2843         struct bau_control *hmaster;
2844  
2845         hmaster = bcp->uvhub_master;
2846 -       spin_lock(&hmaster->disable_lock);
2847 +       raw_spin_lock(&hmaster->disable_lock);
2848         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2849                 stat->s_bau_reenabled++;
2850                 for_each_present_cpu(tcpu) {
2851 @@ -1014,10 +1014,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2852                                 tbcp->period_giveups = 0;
2853                         }
2854                 }
2855 -               spin_unlock(&hmaster->disable_lock);
2856 +               raw_spin_unlock(&hmaster->disable_lock);
2857                 return 0;
2858         }
2859 -       spin_unlock(&hmaster->disable_lock);
2860 +       raw_spin_unlock(&hmaster->disable_lock);
2861         return -1;
2862  }
2863  
2864 @@ -1940,9 +1940,9 @@ static void __init init_per_cpu_tunables(void)
2865                 bcp->cong_reps                  = congested_reps;
2866                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2867                 bcp->giveup_limit               = giveup_limit;
2868 -               spin_lock_init(&bcp->queue_lock);
2869 -               spin_lock_init(&bcp->uvhub_lock);
2870 -               spin_lock_init(&bcp->disable_lock);
2871 +               raw_spin_lock_init(&bcp->queue_lock);
2872 +               raw_spin_lock_init(&bcp->uvhub_lock);
2873 +               raw_spin_lock_init(&bcp->disable_lock);
2874         }
2875  }
2876  
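On PREEMPT_RT a plain spinlock_t becomes a sleeping, priority-inheriting lock, so locks taken on paths that must not sleep -- here the BAU quiesce/reset and throttle paths -- are converted to raw_spinlock_t, which keeps the classic spinning behaviour on every kernel flavour. The pattern in isolation (illustrative names, not from the patch):

	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(example_lock);

	static void example_atomic_path(void)
	{
		unsigned long flags;

		/* Still a true spinning lock on PREEMPT_RT; never sleeps. */
		raw_spin_lock_irqsave(&example_lock, flags);
		/* ... short, bounded, non-sleeping work ... */
		raw_spin_unlock_irqrestore(&example_lock, flags);
	}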
2877 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2878 index b333fc45f9ec..8b85916e6986 100644
2879 --- a/arch/x86/platform/uv/uv_time.c
2880 +++ b/arch/x86/platform/uv/uv_time.c
2881 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2882  
2883  /* There is one of these allocated per node */
2884  struct uv_rtc_timer_head {
2885 -       spinlock_t      lock;
2886 +       raw_spinlock_t  lock;
2887         /* next cpu waiting for timer, local node relative: */
2888         int             next_cpu;
2889         /* number of cpus on this node: */
2890 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2891                                 uv_rtc_deallocate_timers();
2892                                 return -ENOMEM;
2893                         }
2894 -                       spin_lock_init(&head->lock);
2895 +                       raw_spin_lock_init(&head->lock);
2896                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2897                         head->next_cpu = -1;
2898                         blade_info[bid] = head;
2899 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2900         unsigned long flags;
2901         int next_cpu;
2902  
2903 -       spin_lock_irqsave(&head->lock, flags);
2904 +       raw_spin_lock_irqsave(&head->lock, flags);
2905  
2906         next_cpu = head->next_cpu;
2907         *t = expires;
2908 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2909                 if (uv_setup_intr(cpu, expires)) {
2910                         *t = ULLONG_MAX;
2911                         uv_rtc_find_next_timer(head, pnode);
2912 -                       spin_unlock_irqrestore(&head->lock, flags);
2913 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2914                         return -ETIME;
2915                 }
2916         }
2917  
2918 -       spin_unlock_irqrestore(&head->lock, flags);
2919 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2920         return 0;
2921  }
2922  
2923 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2924         unsigned long flags;
2925         int rc = 0;
2926  
2927 -       spin_lock_irqsave(&head->lock, flags);
2928 +       raw_spin_lock_irqsave(&head->lock, flags);
2929  
2930         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2931                 rc = 1;
2932 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2933                         uv_rtc_find_next_timer(head, pnode);
2934         }
2935  
2936 -       spin_unlock_irqrestore(&head->lock, flags);
2937 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2938  
2939         return rc;
2940  }
2941 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2942  static cycle_t uv_read_rtc(struct clocksource *cs)
2943  {
2944         unsigned long offset;
2945 +       cycle_t cycles;
2946  
2947 +       preempt_disable();
2948         if (uv_get_min_hub_revision_id() == 1)
2949                 offset = 0;
2950         else
2951                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2952  
2953 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2954 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2955 +       preempt_enable();
2956 +
2957 +       return cycles;
2958  }
2959  
2960  /*
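The uv_read_rtc() change closes a migration window: the per-blade MMR offset is derived from uv_blade_processor_id(), so the ID lookup and the MMR read must happen on the same CPU, which the added preempt_disable()/preempt_enable() pair guarantees. Reduced to the bare pattern (illustrative per-CPU variable):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(u64, example_shadow);

	/* Sketch: the CPU id and the access derived from it must not be
	 * separated by a migration. */
	static u64 example_stable_read(void)
	{
		int cpu = get_cpu();		/* disables preemption */
		u64 val = per_cpu(example_shadow, cpu);

		put_cpu();
		return val;
	}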
2961 diff --git a/block/blk-core.c b/block/blk-core.c
2962 index d1f2801ce836..6f945bb0fa1a 100644
2963 --- a/block/blk-core.c
2964 +++ b/block/blk-core.c
2965 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
2966  
2967         INIT_LIST_HEAD(&rq->queuelist);
2968         INIT_LIST_HEAD(&rq->timeout_list);
2969 +#ifdef CONFIG_PREEMPT_RT_FULL
2970 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2971 +#endif
2972         rq->cpu = -1;
2973         rq->q = q;
2974         rq->__sector = (sector_t) -1;
2975 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
2976   **/
2977  void blk_start_queue(struct request_queue *q)
2978  {
2979 -       WARN_ON(!irqs_disabled());
2980 +       WARN_ON_NONRT(!irqs_disabled());
2981  
2982         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2983         __blk_run_queue(q);
2984 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
2985                 if (nowait)
2986                         return -EBUSY;
2987  
2988 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2989 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2990                                 !atomic_read(&q->mq_freeze_depth) ||
2991                                 blk_queue_dying(q));
2992                 if (blk_queue_dying(q))
2993 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
2994         struct request_queue *q =
2995                 container_of(ref, struct request_queue, q_usage_counter);
2996  
2997 -       wake_up_all(&q->mq_freeze_wq);
2998 +       swake_up_all(&q->mq_freeze_wq);
2999  }
3000  
3001  static void blk_rq_timed_out_timer(unsigned long data)
3002 @@ -748,7 +751,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3003         q->bypass_depth = 1;
3004         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3005  
3006 -       init_waitqueue_head(&q->mq_freeze_wq);
3007 +       init_swait_queue_head(&q->mq_freeze_wq);
3008  
3009         /*
3010          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3011 @@ -3200,7 +3203,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3012                 blk_run_queue_async(q);
3013         else
3014                 __blk_run_queue(q);
3015 -       spin_unlock(q->queue_lock);
3016 +       spin_unlock_irq(q->queue_lock);
3017  }
3018  
3019  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3020 @@ -3248,7 +3251,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3021  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3022  {
3023         struct request_queue *q;
3024 -       unsigned long flags;
3025         struct request *rq;
3026         LIST_HEAD(list);
3027         unsigned int depth;
3028 @@ -3268,11 +3270,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3029         q = NULL;
3030         depth = 0;
3031  
3032 -       /*
3033 -        * Save and disable interrupts here, to avoid doing it for every
3034 -        * queue lock we have to take.
3035 -        */
3036 -       local_irq_save(flags);
3037         while (!list_empty(&list)) {
3038                 rq = list_entry_rq(list.next);
3039                 list_del_init(&rq->queuelist);
3040 @@ -3285,7 +3282,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3041                                 queue_unplugged(q, depth, from_schedule);
3042                         q = rq->q;
3043                         depth = 0;
3044 -                       spin_lock(q->queue_lock);
3045 +                       spin_lock_irq(q->queue_lock);
3046                 }
3047  
3048                 /*
3049 @@ -3312,8 +3309,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3050          */
3051         if (q)
3052                 queue_unplugged(q, depth, from_schedule);
3053 -
3054 -       local_irq_restore(flags);
3055  }
3056  
3057  void blk_finish_plug(struct blk_plug *plug)
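mq_freeze_wq becomes a simple wait queue (swait). A regular wait_queue_head_t is guarded by a lock that turns into a sleeping lock on RT, and wake_up_all() may run arbitrary wake callbacks for an unbounded number of waiters while that lock is held; swait queues use a raw lock and plain wakeups, so swake_up_all() remains safe from the atomic contexts these paths run in. The plug-flush path also stops disabling interrupts across the whole loop and instead takes each queue lock with spin_lock_irq(), keeping the interrupt-off regions short. Basic swait usage (illustrative names):

	#include <linux/swait.h>
	#include <linux/atomic.h>

	static DECLARE_SWAIT_QUEUE_HEAD(example_wq);
	static atomic_t example_ready = ATOMIC_INIT(0);

	static void example_wait_for_ready(void)
	{
		/* Blocks until the condition is true, like wait_event(). */
		swait_event(example_wq, atomic_read(&example_ready));
	}

	static void example_mark_ready(void)
	{
		atomic_set(&example_ready, 1);
		swake_up_all(&example_wq);
	}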
3058 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3059 index 381cb50a673c..dc8785233d94 100644
3060 --- a/block/blk-ioc.c
3061 +++ b/block/blk-ioc.c
3062 @@ -7,6 +7,7 @@
3063  #include <linux/bio.h>
3064  #include <linux/blkdev.h>
3065  #include <linux/slab.h>
3066 +#include <linux/delay.h>
3067  
3068  #include "blk.h"
3069  
3070 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3071                         spin_unlock(q->queue_lock);
3072                 } else {
3073                         spin_unlock_irqrestore(&ioc->lock, flags);
3074 -                       cpu_relax();
3075 +                       cpu_chill();
3076                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3077                 }
3078         }
3079 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3080                         spin_unlock(icq->q->queue_lock);
3081                 } else {
3082                         spin_unlock_irqrestore(&ioc->lock, flags);
3083 -                       cpu_relax();
3084 +                       cpu_chill();
3085                         goto retry;
3086                 }
3087         }
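Both retry loops above drop the ioc lock and wait for another context to make progress. Spinning with cpu_relax() can livelock on RT, where the contended lock holder is itself a preemptible (and possibly lower-priority) task; cpu_chill(), pulled in through the new <linux/delay.h> include, sleeps briefly instead. The core of the pattern:

	#include <linux/spinlock.h>
	#include <linux/delay.h>	/* cpu_chill() on RT kernels */

	static DEFINE_SPINLOCK(example_lock);

	/* Sketch: back off by sleeping instead of busy-waiting so a
	 * preempted lock holder gets to run. */
	static void example_lock_with_backoff(void)
	{
		while (!spin_trylock(&example_lock))
			cpu_chill();

		/* ... critical section ... */
		spin_unlock(&example_lock);
	}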
3088 diff --git a/block/blk-mq.c b/block/blk-mq.c
3089 index 7b597ec4e9c5..48c9652a701c 100644
3090 --- a/block/blk-mq.c
3091 +++ b/block/blk-mq.c
3092 @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3093  
3094  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3095  {
3096 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3097 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3098  }
3099  
3100  /*
3101 @@ -110,7 +110,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3102         WARN_ON_ONCE(freeze_depth < 0);
3103         if (!freeze_depth) {
3104                 percpu_ref_reinit(&q->q_usage_counter);
3105 -               wake_up_all(&q->mq_freeze_wq);
3106 +               swake_up_all(&q->mq_freeze_wq);
3107         }
3108  }
3109  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3110 @@ -129,7 +129,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3111          * dying, we need to ensure that processes currently waiting on
3112          * the queue are notified as well.
3113          */
3114 -       wake_up_all(&q->mq_freeze_wq);
3115 +       swake_up_all(&q->mq_freeze_wq);
3116  }
3117  
3118  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3119 @@ -177,6 +177,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3120         rq->resid_len = 0;
3121         rq->sense = NULL;
3122  
3123 +#ifdef CONFIG_PREEMPT_RT_FULL
3124 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3125 +#endif
3126         INIT_LIST_HEAD(&rq->timeout_list);
3127         rq->timeout = 0;
3128  
3129 @@ -345,6 +348,17 @@ void blk_mq_end_request(struct request *rq, int error)
3130  }
3131  EXPORT_SYMBOL(blk_mq_end_request);
3132  
3133 +#ifdef CONFIG_PREEMPT_RT_FULL
3134 +
3135 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3136 +{
3137 +       struct request *rq = container_of(work, struct request, work);
3138 +
3139 +       rq->q->softirq_done_fn(rq);
3140 +}
3141 +
3142 +#else
3143 +
3144  static void __blk_mq_complete_request_remote(void *data)
3145  {
3146         struct request *rq = data;
3147 @@ -352,6 +366,8 @@ static void __blk_mq_complete_request_remote(void *data)
3148         rq->q->softirq_done_fn(rq);
3149  }
3150  
3151 +#endif
3152 +
3153  static void blk_mq_ipi_complete_request(struct request *rq)
3154  {
3155         struct blk_mq_ctx *ctx = rq->mq_ctx;
3156 @@ -363,19 +379,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3157                 return;
3158         }
3159  
3160 -       cpu = get_cpu();
3161 +       cpu = get_cpu_light();
3162         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3163                 shared = cpus_share_cache(cpu, ctx->cpu);
3164  
3165         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3166 +#ifdef CONFIG_PREEMPT_RT_FULL
3167 +               schedule_work_on(ctx->cpu, &rq->work);
3168 +#else
3169                 rq->csd.func = __blk_mq_complete_request_remote;
3170                 rq->csd.info = rq;
3171                 rq->csd.flags = 0;
3172                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3173 +#endif
3174         } else {
3175                 rq->q->softirq_done_fn(rq);
3176         }
3177 -       put_cpu();
3178 +       put_cpu_light();
3179  }
3180  
3181  static void __blk_mq_complete_request(struct request *rq)
3182 @@ -906,14 +926,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3183                 return;
3184  
3185         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3186 -               int cpu = get_cpu();
3187 +               int cpu = get_cpu_light();
3188                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3189                         __blk_mq_run_hw_queue(hctx);
3190 -                       put_cpu();
3191 +                       put_cpu_light();
3192                         return;
3193                 }
3194  
3195 -               put_cpu();
3196 +               put_cpu_light();
3197         }
3198  
3199         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
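Two RT adjustments are visible here. First, remote completion no longer goes through smp_call_function_single_async(), which would run softirq_done_fn() in hard-interrupt context; on RT the request's work item (set up in blk_mq_rq_ctx_init() above and in blk_rq_init()) is queued on the target CPU instead, so the handler runs from a preemptible worker. Second, get_cpu_light()/put_cpu_light() replace get_cpu()/put_cpu(): they return the current CPU while only disabling migration, not preemption. Stripped of the block-layer details, the deferral looks like this (illustrative request type, not the real struct request):

	#include <linux/workqueue.h>

	struct example_req {
		struct work_struct work;
		void (*done)(struct example_req *req);
	};

	static void example_complete_work(struct work_struct *work)
	{
		struct example_req *req =
			container_of(work, struct example_req, work);

		req->done(req);	/* process context on the chosen CPU */
	}

	static void example_complete_on(int cpu, struct example_req *req)
	{
		INIT_WORK(&req->work, example_complete_work);
		schedule_work_on(cpu, &req->work);
	}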
3200 diff --git a/block/blk-mq.h b/block/blk-mq.h
3201 index e5d25249028c..1e846b842eab 100644
3202 --- a/block/blk-mq.h
3203 +++ b/block/blk-mq.h
3204 @@ -72,12 +72,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3205   */
3206  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3207  {
3208 -       return __blk_mq_get_ctx(q, get_cpu());
3209 +       return __blk_mq_get_ctx(q, get_cpu_light());
3210  }
3211  
3212  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3213  {
3214 -       put_cpu();
3215 +       put_cpu_light();
3216  }
3217  
3218  struct blk_mq_alloc_data {
3219 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3220 index 06cf9807f49a..c40342643ca0 100644
3221 --- a/block/blk-softirq.c
3222 +++ b/block/blk-softirq.c
3223 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3224                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3225  
3226         local_irq_restore(flags);
3227 +       preempt_check_resched_rt();
3228  }
3229  
3230  /*
3231 @@ -89,6 +90,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
3232                          this_cpu_ptr(&blk_cpu_done));
3233         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3234         local_irq_enable();
3235 +       preempt_check_resched_rt();
3236  
3237         return 0;
3238  }
3239 @@ -141,6 +143,7 @@ void __blk_complete_request(struct request *req)
3240                 goto do_local;
3241  
3242         local_irq_restore(flags);
3243 +       preempt_check_resched_rt();
3244  }
3245  
3246  /**
3247 diff --git a/block/bounce.c b/block/bounce.c
3248 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3249 --- a/block/bounce.c
3250 +++ b/block/bounce.c
3251 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3252         unsigned long flags;
3253         unsigned char *vto;
3254  
3255 -       local_irq_save(flags);
3256 +       local_irq_save_nort(flags);
3257         vto = kmap_atomic(to->bv_page);
3258         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3259         kunmap_atomic(vto);
3260 -       local_irq_restore(flags);
3261 +       local_irq_restore_nort(flags);
3262  }
3263  
3264  #else /* CONFIG_HIGHMEM */
3265 diff --git a/crypto/algapi.c b/crypto/algapi.c
3266 index 1fad2a6b3bbb..ecb7315426a9 100644
3267 --- a/crypto/algapi.c
3268 +++ b/crypto/algapi.c
3269 @@ -719,13 +719,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3270  
3271  int crypto_register_notifier(struct notifier_block *nb)
3272  {
3273 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3274 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3275  }
3276  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3277  
3278  int crypto_unregister_notifier(struct notifier_block *nb)
3279  {
3280 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3281 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3282  }
3283  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3284  
3285 diff --git a/crypto/api.c b/crypto/api.c
3286 index bbc147cb5dec..bc1a848f02ec 100644
3287 --- a/crypto/api.c
3288 +++ b/crypto/api.c
3289 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3290  DECLARE_RWSEM(crypto_alg_sem);
3291  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3292  
3293 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3294 +SRCU_NOTIFIER_HEAD(crypto_chain);
3295  EXPORT_SYMBOL_GPL(crypto_chain);
3296  
3297  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3298 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3299  {
3300         int ok;
3301  
3302 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3303 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3304         if (ok == NOTIFY_DONE) {
3305                 request_module("cryptomgr");
3306 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3307 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3308         }
3309  
3310         return ok;
3311 diff --git a/crypto/internal.h b/crypto/internal.h
3312 index 7eefcdb00227..0ecc7f5a2f40 100644
3313 --- a/crypto/internal.h
3314 +++ b/crypto/internal.h
3315 @@ -47,7 +47,7 @@ struct crypto_larval {
3316  
3317  extern struct list_head crypto_alg_list;
3318  extern struct rw_semaphore crypto_alg_sem;
3319 -extern struct blocking_notifier_head crypto_chain;
3320 +extern struct srcu_notifier_head crypto_chain;
3321  
3322  #ifdef CONFIG_PROC_FS
3323  void __init crypto_init_proc(void);
3324 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3325  
3326  static inline void crypto_notify(unsigned long val, void *v)
3327  {
3328 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3329 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3330  }
3331  
3332  #endif /* _CRYPTO_INTERNAL_H */
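The crypto notifier chain moves from a blocking notifier, whose call path takes the chain's rw_semaphore, to an SRCU notifier, whose readers take no semaphore at all; that removes the rwsem from crypto_notify()/crypto_probing_notify() and with it the lock-ordering problem this chain caused on RT. The SRCU variant is used the same way as the blocking one (illustrative chain and callback):

	#include <linux/notifier.h>

	static SRCU_NOTIFIER_HEAD(example_chain);

	static int example_event(struct notifier_block *nb,
				 unsigned long action, void *data)
	{
		return NOTIFY_OK;
	}

	static struct notifier_block example_nb = {
		.notifier_call = example_event,
	};

	static void example_use_chain(void)
	{
		srcu_notifier_chain_register(&example_chain, &example_nb);
		srcu_notifier_call_chain(&example_chain, 0, NULL);
		srcu_notifier_chain_unregister(&example_chain, &example_nb);
	}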
3333 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3334 index 750fa824d42c..441edf51484a 100644
3335 --- a/drivers/acpi/acpica/acglobal.h
3336 +++ b/drivers/acpi/acpica/acglobal.h
3337 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3338   * interrupt level
3339   */
3340  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3341 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3342 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3343  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3344  
3345  /* Mutex for _OSI support */
3346 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3347 index 3b7fb99362b6..696bf8e62afb 100644
3348 --- a/drivers/acpi/acpica/hwregs.c
3349 +++ b/drivers/acpi/acpica/hwregs.c
3350 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3351                           ACPI_BITMASK_ALL_FIXED_STATUS,
3352                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3353  
3354 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3355 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3356  
3357         /* Clear the fixed events in PM1 A/B */
3358  
3359         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3360                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3361  
3362 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3363 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3364  
3365         if (ACPI_FAILURE(status)) {
3366                 goto exit;
3367 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3368 index 98c26ff39409..6e236f2ea791 100644
3369 --- a/drivers/acpi/acpica/hwxface.c
3370 +++ b/drivers/acpi/acpica/hwxface.c
3371 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3372                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3373         }
3374  
3375 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3376 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3377  
3378         /*
3379          * At this point, we know that the parent register is one of the
3380 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3381  
3382  unlock_and_exit:
3383  
3384 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3385 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3386         return_ACPI_STATUS(status);
3387  }
3388  
3389 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3390 index 15073375bd00..357e7ca5a587 100644
3391 --- a/drivers/acpi/acpica/utmutex.c
3392 +++ b/drivers/acpi/acpica/utmutex.c
3393 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3394                 return_ACPI_STATUS (status);
3395         }
3396  
3397 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3398 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3399         if (ACPI_FAILURE (status)) {
3400                 return_ACPI_STATUS (status);
3401         }
3402 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3403         /* Delete the spinlocks */
3404  
3405         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3406 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3407 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3408         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3409  
3410         /* Delete the reader/writer lock */
3411 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3412 index 051b6158d1b7..7ad293bef6ed 100644
3413 --- a/drivers/ata/libata-sff.c
3414 +++ b/drivers/ata/libata-sff.c
3415 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3416         unsigned long flags;
3417         unsigned int consumed;
3418  
3419 -       local_irq_save(flags);
3420 +       local_irq_save_nort(flags);
3421         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3422 -       local_irq_restore(flags);
3423 +       local_irq_restore_nort(flags);
3424  
3425         return consumed;
3426  }
3427 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3428                 unsigned long flags;
3429  
3430                 /* FIXME: use a bounce buffer */
3431 -               local_irq_save(flags);
3432 +               local_irq_save_nort(flags);
3433                 buf = kmap_atomic(page);
3434  
3435                 /* do the actual data transfer */
3436 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3437                                        do_write);
3438  
3439                 kunmap_atomic(buf);
3440 -               local_irq_restore(flags);
3441 +               local_irq_restore_nort(flags);
3442         } else {
3443                 buf = page_address(page);
3444                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3445 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3446                 unsigned long flags;
3447  
3448                 /* FIXME: use bounce buffer */
3449 -               local_irq_save(flags);
3450 +               local_irq_save_nort(flags);
3451                 buf = kmap_atomic(page);
3452  
3453                 /* do the actual data transfer */
3454 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3455                                                                 count, rw);
3456  
3457                 kunmap_atomic(buf);
3458 -               local_irq_restore(flags);
3459 +               local_irq_restore_nort(flags);
3460         } else {
3461                 buf = page_address(page);
3462                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3463 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3464 index 4b5cd3a7b2b6..fa8329ad79fd 100644
3465 --- a/drivers/block/zram/zcomp.c
3466 +++ b/drivers/block/zram/zcomp.c
3467 @@ -118,12 +118,19 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3468  
3469  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3470  {
3471 -       return *get_cpu_ptr(comp->stream);
3472 +       struct zcomp_strm *zstrm;
3473 +
3474 +       zstrm = *this_cpu_ptr(comp->stream);
3475 +       spin_lock(&zstrm->zcomp_lock);
3476 +       return zstrm;
3477  }
3478  
3479  void zcomp_stream_put(struct zcomp *comp)
3480  {
3481 -       put_cpu_ptr(comp->stream);
3482 +       struct zcomp_strm *zstrm;
3483 +
3484 +       zstrm = *this_cpu_ptr(comp->stream);
3485 +       spin_unlock(&zstrm->zcomp_lock);
3486  }
3487  
3488  int zcomp_compress(struct zcomp_strm *zstrm,
3489 @@ -174,6 +181,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3490                         pr_err("Can't allocate a compression stream\n");
3491                         return NOTIFY_BAD;
3492                 }
3493 +               spin_lock_init(&zstrm->zcomp_lock);
3494                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3495                 break;
3496         case CPU_DEAD:
3497 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3498 index 478cac2ed465..f7a6efdc3285 100644
3499 --- a/drivers/block/zram/zcomp.h
3500 +++ b/drivers/block/zram/zcomp.h
3501 @@ -14,6 +14,7 @@ struct zcomp_strm {
3502         /* compression/decompression buffer */
3503         void *buffer;
3504         struct crypto_comp *tfm;
3505 +       spinlock_t zcomp_lock;
3506  };
3507  
3508  /* dynamic per-device compression frontend */
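zcomp_stream_get() used to pin the caller via get_cpu_ptr(), i.e. by keeping preemption off for the whole compress/decompress operation -- too long for RT and forbidden there, since the operation may sleep. Each per-CPU stream therefore gets its own spinlock (the new zcomp_lock, initialized in the CPU notifier), so exclusive use of a stream no longer relies on preemption being disabled. The scheme in isolation:

	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	/* Illustrative per-CPU object guarded by its own lock rather
	 * than by preempt_disable(). */
	struct example_stream {
		spinlock_t lock;
		void *buffer;
	};

	static DEFINE_PER_CPU(struct example_stream, example_streams);

	static void example_streams_init(void)
	{
		int cpu;

		for_each_possible_cpu(cpu)
			spin_lock_init(&per_cpu(example_streams, cpu).lock);
	}

	static struct example_stream *example_stream_get(void)
	{
		struct example_stream *s = this_cpu_ptr(&example_streams);

		spin_lock(&s->lock);	/* sleeping lock on RT; fine here */
		return s;
	}

	static void example_stream_put(struct example_stream *s)
	{
		spin_unlock(&s->lock);
	}

If the task migrates between this_cpu_ptr() and spin_lock(), it simply ends up using another CPU's stream, which is harmless: the lock, not CPU affinity, provides exclusivity. The same reasoning applies to the zcomp change above.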
3509 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3510 index c9914d653968..2038d138f286 100644
3511 --- a/drivers/block/zram/zram_drv.c
3512 +++ b/drivers/block/zram/zram_drv.c
3513 @@ -528,6 +528,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3514                 goto out_error;
3515         }
3516  
3517 +       zram_meta_init_table_locks(meta, disksize);
3518 +
3519         return meta;
3520  
3521  out_error:
3522 @@ -575,28 +577,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3523         struct zram_meta *meta = zram->meta;
3524         unsigned long handle;
3525         unsigned int size;
3526 +       struct zcomp_strm *zstrm;
3527  
3528 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3529 +       zram_lock_table(&meta->table[index]);
3530         handle = meta->table[index].handle;
3531         size = zram_get_obj_size(meta, index);
3532  
3533         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3534 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3535 +               zram_unlock_table(&meta->table[index]);
3536                 memset(mem, 0, PAGE_SIZE);
3537                 return 0;
3538         }
3539  
3540 +       zstrm = zcomp_stream_get(zram->comp);
3541         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3542         if (size == PAGE_SIZE) {
3543                 memcpy(mem, cmem, PAGE_SIZE);
3544         } else {
3545 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3546 -
3547                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3548 -               zcomp_stream_put(zram->comp);
3549         }
3550         zs_unmap_object(meta->mem_pool, handle);
3551 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3552 +       zcomp_stream_put(zram->comp);
3553 +       zram_unlock_table(&meta->table[index]);
3554  
3555         /* Should NEVER happen. Return bio error if it does. */
3556         if (unlikely(ret)) {
3557 @@ -616,14 +618,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3558         struct zram_meta *meta = zram->meta;
3559         page = bvec->bv_page;
3560  
3561 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3562 +       zram_lock_table(&meta->table[index]);
3563         if (unlikely(!meta->table[index].handle) ||
3564                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3565 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3566 +               zram_unlock_table(&meta->table[index]);
3567                 handle_zero_page(bvec);
3568                 return 0;
3569         }
3570 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3571 +       zram_unlock_table(&meta->table[index]);
3572  
3573         if (is_partial_io(bvec))
3574                 /* Use  a temporary buffer to decompress the page */
3575 @@ -700,10 +702,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3576                 if (user_mem)
3577                         kunmap_atomic(user_mem);
3578                 /* Free memory associated with this sector now. */
3579 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3580 +               zram_lock_table(&meta->table[index]);
3581                 zram_free_page(zram, index);
3582                 zram_set_flag(meta, index, ZRAM_ZERO);
3583 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3584 +               zram_unlock_table(&meta->table[index]);
3585  
3586                 atomic64_inc(&zram->stats.zero_pages);
3587                 ret = 0;
3588 @@ -794,12 +796,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3589          * Free memory associated with this sector
3590          * before overwriting unused sectors.
3591          */
3592 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3593 +       zram_lock_table(&meta->table[index]);
3594         zram_free_page(zram, index);
3595  
3596         meta->table[index].handle = handle;
3597         zram_set_obj_size(meta, index, clen);
3598 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3599 +       zram_unlock_table(&meta->table[index]);
3600  
3601         /* Update stats */
3602         atomic64_add(clen, &zram->stats.compr_data_size);
3603 @@ -842,9 +844,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3604         }
3605  
3606         while (n >= PAGE_SIZE) {
3607 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3608 +               zram_lock_table(&meta->table[index]);
3609                 zram_free_page(zram, index);
3610 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3611 +               zram_unlock_table(&meta->table[index]);
3612                 atomic64_inc(&zram->stats.notify_free);
3613                 index++;
3614                 n -= PAGE_SIZE;
3615 @@ -973,9 +975,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3616         zram = bdev->bd_disk->private_data;
3617         meta = zram->meta;
3618  
3619 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3620 +       zram_lock_table(&meta->table[index]);
3621         zram_free_page(zram, index);
3622 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3623 +       zram_unlock_table(&meta->table[index]);
3624         atomic64_inc(&zram->stats.notify_free);
3625  }
3626  
3627 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3628 index 74fcf10da374..fd4020c99b9e 100644
3629 --- a/drivers/block/zram/zram_drv.h
3630 +++ b/drivers/block/zram/zram_drv.h
3631 @@ -73,6 +73,9 @@ enum zram_pageflags {
3632  struct zram_table_entry {
3633         unsigned long handle;
3634         unsigned long value;
3635 +#ifdef CONFIG_PREEMPT_RT_BASE
3636 +       spinlock_t lock;
3637 +#endif
3638  };
3639  
3640  struct zram_stats {
3641 @@ -120,4 +123,42 @@ struct zram {
3642          */
3643         bool claim; /* Protected by bdev->bd_mutex */
3644  };
3645 +
3646 +#ifndef CONFIG_PREEMPT_RT_BASE
3647 +static inline void zram_lock_table(struct zram_table_entry *table)
3648 +{
3649 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3650 +}
3651 +
3652 +static inline void zram_unlock_table(struct zram_table_entry *table)
3653 +{
3654 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3655 +}
3656 +
3657 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3658 +#else /* CONFIG_PREEMPT_RT_BASE */
3659 +static inline void zram_lock_table(struct zram_table_entry *table)
3660 +{
3661 +       spin_lock(&table->lock);
3662 +       __set_bit(ZRAM_ACCESS, &table->value);
3663 +}
3664 +
3665 +static inline void zram_unlock_table(struct zram_table_entry *table)
3666 +{
3667 +       __clear_bit(ZRAM_ACCESS, &table->value);
3668 +       spin_unlock(&table->lock);
3669 +}
3670 +
3671 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3672 +{
3673 +        size_t num_pages = disksize >> PAGE_SHIFT;
3674 +        size_t index;
3675 +
3676 +        for (index = 0; index < num_pages; index++) {
3677 +               spinlock_t *lock = &meta->table[index].lock;
3678 +               spin_lock_init(lock);
3679 +        }
3680 +}
3681 +#endif /* CONFIG_PREEMPT_RT_BASE */
3682 +
3683  #endif
3684 diff --git a/drivers/char/random.c b/drivers/char/random.c
3685 index 08d1dd58c0d2..25ee319dc8e3 100644
3686 --- a/drivers/char/random.c
3687 +++ b/drivers/char/random.c
3688 @@ -262,6 +262,7 @@
3689  #include <linux/syscalls.h>
3690  #include <linux/completion.h>
3691  #include <linux/uuid.h>
3692 +#include <linux/locallock.h>
3693  #include <crypto/chacha20.h>
3694  
3695  #include <asm/processor.h>
3696 @@ -1028,8 +1029,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3697         } sample;
3698         long delta, delta2, delta3;
3699  
3700 -       preempt_disable();
3701 -
3702         sample.jiffies = jiffies;
3703         sample.cycles = random_get_entropy();
3704         sample.num = num;
3705 @@ -1070,7 +1069,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3706                  */
3707                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3708         }
3709 -       preempt_enable();
3710  }
3711  
3712  void add_input_randomness(unsigned int type, unsigned int code,
3713 @@ -1123,28 +1121,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3714         return *(ptr + f->reg_idx++);
3715  }
3716  
3717 -void add_interrupt_randomness(int irq, int irq_flags)
3718 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3719  {
3720         struct entropy_store    *r;
3721         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3722 -       struct pt_regs          *regs = get_irq_regs();
3723         unsigned long           now = jiffies;
3724         cycles_t                cycles = random_get_entropy();
3725         __u32                   c_high, j_high;
3726 -       __u64                   ip;
3727         unsigned long           seed;
3728         int                     credit = 0;
3729  
3730         if (cycles == 0)
3731 -               cycles = get_reg(fast_pool, regs);
3732 +               cycles = get_reg(fast_pool, NULL);
3733         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3734         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3735         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3736         fast_pool->pool[1] ^= now ^ c_high;
3737 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3738 +       if (!ip)
3739 +               ip = _RET_IP_;
3740         fast_pool->pool[2] ^= ip;
3741         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3742 -               get_reg(fast_pool, regs);
3743 +               get_reg(fast_pool, NULL);
3744  
3745         fast_mix(fast_pool);
3746         add_interrupt_bench(cycles);
3747 @@ -2056,6 +2053,7 @@ struct batched_entropy {
3748   * goal of being quite fast and not depleting entropy.
3749   */
3750  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_long);
3751 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_long_lock);
3752  unsigned long get_random_long(void)
3753  {
3754         unsigned long ret;
3755 @@ -2064,13 +2062,13 @@ unsigned long get_random_long(void)
3756         if (arch_get_random_long(&ret))
3757                 return ret;
3758  
3759 -       batch = &get_cpu_var(batched_entropy_long);
3760 +       batch = &get_locked_var(batched_entropy_long_lock, batched_entropy_long);
3761         if (batch->position % ARRAY_SIZE(batch->entropy_long) == 0) {
3762                 extract_crng((u8 *)batch->entropy_long);
3763                 batch->position = 0;
3764         }
3765         ret = batch->entropy_long[batch->position++];
3766 -       put_cpu_var(batched_entropy_long);
3767 +       put_locked_var(batched_entropy_long_lock, batched_entropy_long);
3768         return ret;
3769  }
3770  EXPORT_SYMBOL(get_random_long);
3771 @@ -2082,6 +2080,8 @@ unsigned int get_random_int(void)
3772  }
3773  #else
3774  static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_int);
3775 +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_int_lock);
3776 +
3777  unsigned int get_random_int(void)
3778  {
3779         unsigned int ret;
3780 @@ -2090,13 +2090,13 @@ unsigned int get_random_int(void)
3781         if (arch_get_random_int(&ret))
3782                 return ret;
3783  
3784 -       batch = &get_cpu_var(batched_entropy_int);
3785 +       batch = &get_locked_var(batched_entropy_int_lock, batched_entropy_int);
3786         if (batch->position % ARRAY_SIZE(batch->entropy_int) == 0) {
3787                 extract_crng((u8 *)batch->entropy_int);
3788                 batch->position = 0;
3789         }
3790         ret = batch->entropy_int[batch->position++];
3791 -       put_cpu_var(batched_entropy_int);
3792 +       put_locked_var(batched_entropy_int_lock, batched_entropy_int);
3793         return ret;
3794  }
3795  #endif
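The batched entropy pools are per-CPU, and the stock code serialized access with get_cpu_var()/put_cpu_var(), i.e. by disabling preemption. The RT patch switches to local locks from <linux/locallock.h>: get_locked_var()/put_locked_var() act like get_cpu_var()/put_cpu_var() on !RT, but on RT they take a per-CPU spinlock, so the section stays preemptible while still having a single owner per CPU. The connector driver further down (send_msg()) uses the same facility through local_lock()/local_unlock(), and the interrupt entropy hook now takes the instruction pointer as an argument because on RT it runs from the threaded handler, where get_irq_regs() no longer describes the interrupted context. A minimal sketch of the local-lock idiom, reusing the API names visible in these hunks:

	#include <linux/locallock.h>	/* provided by the RT patch */
	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned int, example_seq);
	static DEFINE_LOCAL_IRQ_LOCK(example_lock);

	/* Sketch: bump a per-CPU sequence number without depending on
	 * preemption being disabled. */
	static unsigned int example_next_seq(void)
	{
		unsigned int seq;

		seq = ++get_locked_var(example_lock, example_seq);
		put_locked_var(example_lock, example_seq);

		return seq;
	}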
3796 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3797 index 4da2af9694a2..5b6f57f500b8 100644
3798 --- a/drivers/clocksource/tcb_clksrc.c
3799 +++ b/drivers/clocksource/tcb_clksrc.c
3800 @@ -23,8 +23,7 @@
3801   *     this 32 bit free-running counter. the second channel is not used.
3802   *
3803   *   - The third channel may be used to provide a 16-bit clockevent
3804 - *     source, used in either periodic or oneshot mode.  This runs
3805 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3806 + *     source, used in either periodic or oneshot mode.
3807   *
3808   * A boot clocksource and clockevent source are also currently needed,
3809   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3810 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3811  struct tc_clkevt_device {
3812         struct clock_event_device       clkevt;
3813         struct clk                      *clk;
3814 +       bool                            clk_enabled;
3815 +       u32                             freq;
3816         void __iomem                    *regs;
3817  };
3818  
3819 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3820         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3821  }
3822  
3823 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3824 - * because using one of the divided clocks would usually mean the
3825 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3826 - *
3827 - * A divided clock could be good for high resolution timers, since
3828 - * 30.5 usec resolution can seem "low".
3829 - */
3830  static u32 timer_clock;
3831  
3832 +static void tc_clk_disable(struct clock_event_device *d)
3833 +{
3834 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3835 +
3836 +       clk_disable(tcd->clk);
3837 +       tcd->clk_enabled = false;
3838 +}
3839 +
3840 +static void tc_clk_enable(struct clock_event_device *d)
3841 +{
3842 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3843 +
3844 +       if (tcd->clk_enabled)
3845 +               return;
3846 +       clk_enable(tcd->clk);
3847 +       tcd->clk_enabled = true;
3848 +}
3849 +
3850  static int tc_shutdown(struct clock_event_device *d)
3851  {
3852         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3853 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3854  
3855         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3856         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3857 +       return 0;
3858 +}
3859 +
3860 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3861 +{
3862 +       tc_shutdown(d);
3863         if (!clockevent_state_detached(d))
3864 -               clk_disable(tcd->clk);
3865 +               tc_clk_disable(d);
3866  
3867         return 0;
3868  }
3869 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3870         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3871                 tc_shutdown(d);
3872  
3873 -       clk_enable(tcd->clk);
3874 +       tc_clk_enable(d);
3875  
3876 -       /* slow clock, count up to RC, then irq and stop */
3877 +       /* count up to RC, then irq and stop */
3878         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3879                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3880         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3881 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3882         /* By not making the gentime core emulate periodic mode on top
3883          * of oneshot, we get lower overhead and improved accuracy.
3884          */
3885 -       clk_enable(tcd->clk);
3886 +       tc_clk_enable(d);
3887  
3888 -       /* slow clock, count up to RC, then irq and restart */
3889 +       /* count up to RC, then irq and restart */
3890         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3891                      regs + ATMEL_TC_REG(2, CMR));
3892 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3893 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3894  
3895         /* Enable clock and interrupts on RC compare */
3896         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3897 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3898                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3899                                           CLOCK_EVT_FEAT_ONESHOT,
3900                 /* Should be lower than at91rm9200's system timer */
3901 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3902                 .rating                 = 125,
3903 +#else
3904 +               .rating                 = 200,
3905 +#endif
3906                 .set_next_event         = tc_next_event,
3907 -               .set_state_shutdown     = tc_shutdown,
3908 +               .set_state_shutdown     = tc_shutdown_clk_off,
3909                 .set_state_periodic     = tc_set_periodic,
3910                 .set_state_oneshot      = tc_set_oneshot,
3911         },
3912 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3913         return IRQ_NONE;
3914  }
3915  
3916 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3917 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3918  {
3919 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3920         int ret;
3921         struct clk *t2_clk = tc->clk[2];
3922         int irq = tc->irq[2];
3923 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3924         clkevt.regs = tc->regs;
3925         clkevt.clk = t2_clk;
3926  
3927 -       timer_clock = clk32k_divisor_idx;
3928 +       timer_clock = divisor_idx;
3929 +       if (!divisor)
3930 +               clkevt.freq = 32768;
3931 +       else
3932 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3933  
3934         clkevt.clkevt.cpumask = cpumask_of(0);
3935  
3936 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3937                 return ret;
3938         }
3939  
3940 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3941 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3942  
3943         return ret;
3944  }
3945 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
3946                 goto err_disable_t1;
3947  
3948         /* channel 2:  periodic and oneshot timer support */
3949 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3950         ret = setup_clkevents(tc, clk32k_divisor_idx);
3951 +#else
3952 +       ret = setup_clkevents(tc, best_divisor_idx);
3953 +#endif
3954         if (ret)
3955                 goto err_unregister_clksrc;
3956  
3957 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
3958 index 6555821bbdae..93288849b2bd 100644
3959 --- a/drivers/clocksource/timer-atmel-pit.c
3960 +++ b/drivers/clocksource/timer-atmel-pit.c
3961 @@ -46,6 +46,7 @@ struct pit_data {
3962         u32             cycle;
3963         u32             cnt;
3964         unsigned int    irq;
3965 +       bool            irq_requested;
3966         struct clk      *mck;
3967  };
3968  
3969 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
3970  
3971         /* disable irq, leaving the clocksource active */
3972         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3973 +       if (data->irq_requested) {
3974 +               free_irq(data->irq, data);
3975 +               data->irq_requested = false;
3976 +       }
3977         return 0;
3978  }
3979  
3980 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3981  /*
3982   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
3983   */
3984  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3985  {
3986         struct pit_data *data = clkevt_to_pit_data(dev);
3987 +       int ret;
3988 +
3989 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3990 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3991 +                         "at91_tick", data);
3992 +       if (ret)
3993 +               panic(pr_fmt("Unable to setup IRQ\n"));
3994 +
3995 +       data->irq_requested = true;
3996  
3997         /* update clocksource counter */
3998         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3999 @@ -230,15 +245,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
4000                 return ret;
4001         }
4002  
4003 -       /* Set up irq handler */
4004 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4005 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4006 -                         "at91_tick", data);
4007 -       if (ret) {
4008 -               pr_err("Unable to setup IRQ\n");
4009 -               return ret;
4010 -       }
4011 -
4012         /* Set up and register clockevents */
4013         data->clkevt.name = "pit";
4014         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
4015 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
4016 index e90ab5b63a90..9e124087c55f 100644
4017 --- a/drivers/clocksource/timer-atmel-st.c
4018 +++ b/drivers/clocksource/timer-atmel-st.c
4019 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
4020         last_crtr = read_CRTR();
4021  }
4022  
4023 +static int atmel_st_irq;
4024 +
4025  static int clkevt32k_shutdown(struct clock_event_device *evt)
4026  {
4027         clkdev32k_disable_and_flush_irq();
4028         irqmask = 0;
4029         regmap_write(regmap_st, AT91_ST_IER, irqmask);
4030 +       free_irq(atmel_st_irq, regmap_st);
4031         return 0;
4032  }
4033  
4034  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4035  {
4036 +       int ret;
4037 +
4038         clkdev32k_disable_and_flush_irq();
4039  
4040 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4041 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4042 +                         "at91_tick", regmap_st);
4043 +       if (ret)
4044 +               panic(pr_fmt("Unable to setup IRQ\n"));
4045 +
4046         /*
4047          * ALM for oneshot irqs, set by next_event()
4048          * before 32 seconds have passed.
4049 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4050  
4051  static int clkevt32k_set_periodic(struct clock_event_device *dev)
4052  {
4053 +       int ret;
4054 +
4055         clkdev32k_disable_and_flush_irq();
4056  
4057 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4058 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4059 +                         "at91_tick", regmap_st);
4060 +       if (ret)
4061 +               panic(pr_fmt("Unable to setup IRQ\n"));
4062 +
4063         /* PIT for periodic irqs; fixed rate of 1/HZ */
4064         irqmask = AT91_ST_PITS;
4065         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
4066 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
4067  {
4068         struct clk *sclk;
4069         unsigned int sclk_rate, val;
4070 -       int irq, ret;
4071 +       int ret;
4072  
4073         regmap_st = syscon_node_to_regmap(node);
4074         if (IS_ERR(regmap_st)) {
4075 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
4076         regmap_read(regmap_st, AT91_ST_SR, &val);
4077  
4078         /* Get the interrupts property */
4079 -       irq  = irq_of_parse_and_map(node, 0);
4080 -       if (!irq) {
4081 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
4082 +       if (!atmel_st_irq) {
4083                 pr_err("Unable to get IRQ from DT\n");
4084                 return -EINVAL;
4085         }
4086  
4087 -       /* Make IRQs happen for the system timer */
4088 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
4089 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4090 -                         "at91_tick", regmap_st);
4091 -       if (ret) {
4092 -               pr_err("Unable to setup IRQ\n");
4093 -               return ret;
4094 -       }
4095 -
4096         sclk = of_clk_get(node, 0);
4097         if (IS_ERR(sclk)) {
4098                 pr_err("Unable to get slow clock\n");
4099 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
4100 index a782ce87715c..19d265948526 100644
4101 --- a/drivers/connector/cn_proc.c
4102 +++ b/drivers/connector/cn_proc.c
4103 @@ -32,6 +32,7 @@
4104  #include <linux/pid_namespace.h>
4105  
4106  #include <linux/cn_proc.h>
4107 +#include <linux/locallock.h>
4108  
4109  /*
4110   * Size of a cn_msg followed by a proc_event structure.  Since the
4111 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4112  
4113  /* proc_event_counts is used as the sequence number of the netlink message */
4114  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4115 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4116  
4117  static inline void send_msg(struct cn_msg *msg)
4118  {
4119 -       preempt_disable();
4120 +       local_lock(send_msg_lock);
4121  
4122         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4123         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4124 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4125          */
4126         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4127  
4128 -       preempt_enable();
4129 +       local_unlock(send_msg_lock);
4130  }
4131  
4132  void proc_fork_connector(struct task_struct *task)
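
The cn_proc hunk above is the canonical -rt local-lock conversion: on a stock kernel preempt_disable() is enough to serialize the per-CPU sequence counter, but on PREEMPT_RT the netlink send path may sleep, so the section is guarded by a local lock that keeps the code preemptible while still serializing per-CPU state. Below is a minimal sketch of the same pattern, assuming the <linux/locallock.h> API this patch introduces; the counter and function names are hypothetical.

	#include <linux/percpu.h>
	#include <linux/locallock.h>

	/* hypothetical per-CPU event counter, serialized by a local lock */
	static DEFINE_PER_CPU(u32, example_event_count);
	static DEFINE_LOCAL_IRQ_LOCK(example_lock);

	static u32 example_next_seq(void)
	{
		u32 seq;

		/* preemptible on -rt; plain preempt_disable() semantics otherwise */
		local_lock(example_lock);
		seq = __this_cpu_inc_return(example_event_count) - 1;
		local_unlock(example_lock);

		return seq;
	}
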
4133 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4134 index adbd1de1cea5..1fac5074f2cf 100644
4135 --- a/drivers/cpufreq/Kconfig.x86
4136 +++ b/drivers/cpufreq/Kconfig.x86
4137 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4138  
4139  config X86_POWERNOW_K8
4140         tristate "AMD Opteron/Athlon64 PowerNow!"
4141 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4142 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4143         help
4144           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4145           Support for K10 and newer processors is now in acpi-cpufreq.
4146 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4147 index 2117f172d7a2..96c15501b0c8 100644
4148 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4149 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4150 @@ -1489,7 +1489,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
4151         if (ret)
4152                 return ret;
4153  
4154 +#ifndef CONFIG_PREEMPT_RT_BASE
4155         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4156 +#endif
4157  
4158         i915_gem_execbuffer_move_to_active(vmas, params->request);
4159  
4160 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4161 index 755d78832a66..97fb03dc4971 100644
4162 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4163 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4164 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4165         if (!mutex_is_locked(mutex))
4166                 return false;
4167  
4168 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4169 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4170         return mutex->owner == task;
4171  #else
4172         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4173 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4174 index 02908e37c228..05c0480576e1 100644
4175 --- a/drivers/gpu/drm/i915/i915_irq.c
4176 +++ b/drivers/gpu/drm/i915/i915_irq.c
4177 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4178         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4179  
4180         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4181 +       preempt_disable_rt();
4182  
4183         /* Get optional system timestamp before query. */
4184         if (stime)
4185 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4186                 *etime = ktime_get();
4187  
4188         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4189 +       preempt_enable_rt();
4190  
4191         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4192  
4193 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4194 index 5dc6082639db..c32458fb3be2 100644
4195 --- a/drivers/gpu/drm/i915/intel_display.c
4196 +++ b/drivers/gpu/drm/i915/intel_display.c
4197 @@ -12131,7 +12131,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4198         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4199         struct intel_flip_work *work;
4200  
4201 -       WARN_ON(!in_interrupt());
4202 +       WARN_ON_NONRT(!in_interrupt());
4203  
4204         if (crtc == NULL)
4205                 return;
4206 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4207 index dbed12c484c9..5c540b78e8b5 100644
4208 --- a/drivers/gpu/drm/i915/intel_sprite.c
4209 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4210 @@ -35,6 +35,7 @@
4211  #include <drm/drm_rect.h>
4212  #include <drm/drm_atomic.h>
4213  #include <drm/drm_plane_helper.h>
4214 +#include <linux/locallock.h>
4215  #include "intel_drv.h"
4216  #include "intel_frontbuffer.h"
4217  #include <drm/i915_drm.h>
4218 @@ -65,6 +66,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4219                             1000 * adjusted_mode->crtc_htotal);
4220  }
4221  
4222 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4223 +
4224  /**
4225   * intel_pipe_update_start() - start update of a set of display registers
4226   * @crtc: the crtc of which the registers are going to be updated
4227 @@ -95,7 +98,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4228         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4229         max = vblank_start - 1;
4230  
4231 -       local_irq_disable();
4232 +       local_lock_irq(pipe_update_lock);
4233  
4234         if (min <= 0 || max <= 0)
4235                 return;
4236 @@ -125,11 +128,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4237                         break;
4238                 }
4239  
4240 -               local_irq_enable();
4241 +               local_unlock_irq(pipe_update_lock);
4242  
4243                 timeout = schedule_timeout(timeout);
4244  
4245 -               local_irq_disable();
4246 +               local_lock_irq(pipe_update_lock);
4247         }
4248  
4249         finish_wait(wq, &wait);
4250 @@ -181,7 +184,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4251                 crtc->base.state->event = NULL;
4252         }
4253  
4254 -       local_irq_enable();
4255 +       local_unlock_irq(pipe_update_lock);
4256  
4257         if (crtc->debug.start_vbl_count &&
4258             crtc->debug.start_vbl_count != end_vbl_count) {
4259 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4260 index 192b2d3a79cb..d5372a207326 100644
4261 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4262 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4263 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4264         if (!mutex_is_locked(mutex))
4265                 return false;
4266  
4267 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4268 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4269         return mutex->owner == task;
4270  #else
4271         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4272 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4273 index cdb8cb568c15..b6d7fd964cbc 100644
4274 --- a/drivers/gpu/drm/radeon/radeon_display.c
4275 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4276 @@ -1845,6 +1845,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4277         struct radeon_device *rdev = dev->dev_private;
4278  
4279         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4280 +       preempt_disable_rt();
4281  
4282         /* Get optional system timestamp before query. */
4283         if (stime)
4284 @@ -1937,6 +1938,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4285                 *etime = ktime_get();
4286  
4287         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4288 +       preempt_enable_rt();
4289  
4290         /* Decode into vertical and horizontal scanout position. */
4291         *vpos = position & 0x1fff;
4292 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4293 index 0276d2ef06ee..8868045eabde 100644
4294 --- a/drivers/hv/vmbus_drv.c
4295 +++ b/drivers/hv/vmbus_drv.c
4296 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4297         void *page_addr;
4298         struct hv_message *msg;
4299         union hv_synic_event_flags *event;
4300 +       struct pt_regs *regs = get_irq_regs();
4301 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4302         bool handled = false;
4303  
4304         page_addr = hv_context.synic_event_page[cpu];
4305 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4306                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4307         }
4308  
4309 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4310 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4311  }
4312  
4313  
4314 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4315 index 36f76e28a0bf..394f142f90c7 100644
4316 --- a/drivers/ide/alim15x3.c
4317 +++ b/drivers/ide/alim15x3.c
4318 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4319  
4320         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4321  
4322 -       local_irq_save(flags);
4323 +       local_irq_save_nort(flags);
4324  
4325         if (m5229_revision < 0xC2) {
4326                 /*
4327 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4328         }
4329         pci_dev_put(north);
4330         pci_dev_put(isa_dev);
4331 -       local_irq_restore(flags);
4332 +       local_irq_restore_nort(flags);
4333         return 0;
4334  }
4335  
4336 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4337 index 0ceae5cbd89a..c212e85d7f3e 100644
4338 --- a/drivers/ide/hpt366.c
4339 +++ b/drivers/ide/hpt366.c
4340 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4341  
4342         dma_old = inb(base + 2);
4343  
4344 -       local_irq_save(flags);
4345 +       local_irq_save_nort(flags);
4346  
4347         dma_new = dma_old;
4348         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4349 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4350         if (dma_new != dma_old)
4351                 outb(dma_new, base + 2);
4352  
4353 -       local_irq_restore(flags);
4354 +       local_irq_restore_nort(flags);
4355  
4356         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4357                          hwif->name, base, base + 7);
4358 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4359 index 19763977568c..4169433faab5 100644
4360 --- a/drivers/ide/ide-io-std.c
4361 +++ b/drivers/ide/ide-io-std.c
4362 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4363                 unsigned long uninitialized_var(flags);
4364  
4365                 if ((io_32bit & 2) && !mmio) {
4366 -                       local_irq_save(flags);
4367 +                       local_irq_save_nort(flags);
4368                         ata_vlb_sync(io_ports->nsect_addr);
4369                 }
4370  
4371 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4372                         insl(data_addr, buf, words);
4373  
4374                 if ((io_32bit & 2) && !mmio)
4375 -                       local_irq_restore(flags);
4376 +                       local_irq_restore_nort(flags);
4377  
4378                 if (((len + 1) & 3) < 2)
4379                         return;
4380 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4381                 unsigned long uninitialized_var(flags);
4382  
4383                 if ((io_32bit & 2) && !mmio) {
4384 -                       local_irq_save(flags);
4385 +                       local_irq_save_nort(flags);
4386                         ata_vlb_sync(io_ports->nsect_addr);
4387                 }
4388  
4389 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4390                         outsl(data_addr, buf, words);
4391  
4392                 if ((io_32bit & 2) && !mmio)
4393 -                       local_irq_restore(flags);
4394 +                       local_irq_restore_nort(flags);
4395  
4396                 if (((len + 1) & 3) < 2)
4397                         return;
4398 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4399 index 669ea1e45795..e12e43e62245 100644
4400 --- a/drivers/ide/ide-io.c
4401 +++ b/drivers/ide/ide-io.c
4402 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4403                 /* disable_irq_nosync ?? */
4404                 disable_irq(hwif->irq);
4405                 /* local CPU only, as if we were handling an interrupt */
4406 -               local_irq_disable();
4407 +               local_irq_disable_nort();
4408                 if (hwif->polling) {
4409                         startstop = handler(drive);
4410                 } else if (drive_is_ready(drive)) {
4411 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4412 index 376f2dc410c5..f014dd1b73dc 100644
4413 --- a/drivers/ide/ide-iops.c
4414 +++ b/drivers/ide/ide-iops.c
4415 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4416                                 if ((stat & ATA_BUSY) == 0)
4417                                         break;
4418  
4419 -                               local_irq_restore(flags);
4420 +                               local_irq_restore_nort(flags);
4421                                 *rstat = stat;
4422                                 return -EBUSY;
4423                         }
4424                 }
4425 -               local_irq_restore(flags);
4426 +               local_irq_restore_nort(flags);
4427         }
4428         /*
4429          * Allow status to settle, then read it again.
4430 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4431 index 0b63facd1d87..4ceba37afc0c 100644
4432 --- a/drivers/ide/ide-probe.c
4433 +++ b/drivers/ide/ide-probe.c
4434 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4435         int bswap = 1;
4436  
4437         /* local CPU only; some systems need this */
4438 -       local_irq_save(flags);
4439 +       local_irq_save_nort(flags);
4440         /* read 512 bytes of id info */
4441         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4442 -       local_irq_restore(flags);
4443 +       local_irq_restore_nort(flags);
4444  
4445         drive->dev_flags |= IDE_DFLAG_ID_READ;
4446  #ifdef DEBUG
4447 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4448 index a716693417a3..be0568c722d6 100644
4449 --- a/drivers/ide/ide-taskfile.c
4450 +++ b/drivers/ide/ide-taskfile.c
4451 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4452  
4453                 page_is_high = PageHighMem(page);
4454                 if (page_is_high)
4455 -                       local_irq_save(flags);
4456 +                       local_irq_save_nort(flags);
4457  
4458                 buf = kmap_atomic(page) + offset;
4459  
4460 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4461                 kunmap_atomic(buf);
4462  
4463                 if (page_is_high)
4464 -                       local_irq_restore(flags);
4465 +                       local_irq_restore_nort(flags);
4466  
4467                 len -= nr_bytes;
4468         }
4469 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4470         }
4471  
4472         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4473 -               local_irq_disable();
4474 +               local_irq_disable_nort();
4475  
4476         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4477  
4478 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4479 index fddff403d5d2..cca1bb4fbfe3 100644
4480 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4481 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4482 @@ -902,7 +902,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4483  
4484         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4485  
4486 -       local_irq_save(flags);
4487 +       local_irq_save_nort(flags);
4488         netif_addr_lock(dev);
4489         spin_lock(&priv->lock);
4490  
4491 @@ -984,7 +984,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4492  
4493         spin_unlock(&priv->lock);
4494         netif_addr_unlock(dev);
4495 -       local_irq_restore(flags);
4496 +       local_irq_restore_nort(flags);
4497  
4498         /*
4499          * make sure the in-flight joins have finished before we attempt
4500 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4501 index 4a2a9e370be7..e970d9afd179 100644
4502 --- a/drivers/input/gameport/gameport.c
4503 +++ b/drivers/input/gameport/gameport.c
4504 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4505         tx = ~0;
4506  
4507         for (i = 0; i < 50; i++) {
4508 -               local_irq_save(flags);
4509 +               local_irq_save_nort(flags);
4510                 t1 = ktime_get_ns();
4511                 for (t = 0; t < 50; t++)
4512                         gameport_read(gameport);
4513                 t2 = ktime_get_ns();
4514                 t3 = ktime_get_ns();
4515 -               local_irq_restore(flags);
4516 +               local_irq_restore_nort(flags);
4517                 udelay(i * 10);
4518                 t = (t2 - t1) - (t3 - t2);
4519                 if (t < tx)
4520 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4521         tx = 1 << 30;
4522  
4523         for(i = 0; i < 50; i++) {
4524 -               local_irq_save(flags);
4525 +               local_irq_save_nort(flags);
4526                 GET_TIME(t1);
4527                 for (t = 0; t < 50; t++) gameport_read(gameport);
4528                 GET_TIME(t2);
4529                 GET_TIME(t3);
4530 -               local_irq_restore(flags);
4531 +               local_irq_restore_nort(flags);
4532                 udelay(i * 10);
4533                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4534         }
4535 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4536         tx = 1 << 30;
4537  
4538         for(i = 0; i < 50; i++) {
4539 -               local_irq_save(flags);
4540 +               local_irq_save_nort(flags);
4541                 t1 = rdtsc();
4542                 for (t = 0; t < 50; t++) gameport_read(gameport);
4543                 t2 = rdtsc();
4544 -               local_irq_restore(flags);
4545 +               local_irq_restore_nort(flags);
4546                 udelay(i * 10);
4547                 if (t2 - t1 < tx) tx = t2 - t1;
4548         }
4549 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4550 index 11a13b5be73a..baaed0ac274b 100644
4551 --- a/drivers/iommu/amd_iommu.c
4552 +++ b/drivers/iommu/amd_iommu.c
4553 @@ -1923,10 +1923,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4554         int ret;
4555  
4556         /*
4557 -        * Must be called with IRQs disabled. Warn here to detect early
4558 -        * when its not.
4559 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4560 +        * detect early when it's not.
4561          */
4562 -       WARN_ON(!irqs_disabled());
4563 +       WARN_ON_NONRT(!irqs_disabled());
4564  
4565         /* lock domain */
4566         spin_lock(&domain->lock);
4567 @@ -2094,10 +2094,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4568         struct protection_domain *domain;
4569  
4570         /*
4571 -        * Must be called with IRQs disabled. Warn here to detect early
4572 -        * when its not.
4573 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4574 +        * detect early when it's not.
4575          */
4576 -       WARN_ON(!irqs_disabled());
4577 +       WARN_ON_NONRT(!irqs_disabled());
4578  
4579         if (WARN_ON(!dev_data->domain))
4580                 return;
4581 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4582 index b9e50c10213b..fd3b4657723f 100644
4583 --- a/drivers/iommu/intel-iommu.c
4584 +++ b/drivers/iommu/intel-iommu.c
4585 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4586         struct deferred_flush_table *tables;
4587  };
4588  
4589 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4590 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4591  
4592  /* bitmap for indexing intel_iommus */
4593  static int g_num_of_iommus;
4594 @@ -3716,10 +3716,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4595         struct intel_iommu *iommu;
4596         struct deferred_flush_entry *entry;
4597         struct deferred_flush_data *flush_data;
4598 -       unsigned int cpuid;
4599  
4600 -       cpuid = get_cpu();
4601 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4602 +       flush_data = raw_cpu_ptr(&deferred_flush);
4603  
4604         /* Flush all CPUs' entries to avoid deferring too much.  If
4605          * this becomes a bottleneck, can just flush us, and rely on
4606 @@ -3752,8 +3750,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4607         }
4608         flush_data->size++;
4609         spin_unlock_irqrestore(&flush_data->lock, flags);
4610 -
4611 -       put_cpu();
4612  }
4613  
4614  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4615 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4616 index e23001bfcfee..359d5d169ec0 100644
4617 --- a/drivers/iommu/iova.c
4618 +++ b/drivers/iommu/iova.c
4619 @@ -22,6 +22,7 @@
4620  #include <linux/slab.h>
4621  #include <linux/smp.h>
4622  #include <linux/bitops.h>
4623 +#include <linux/cpu.h>
4624  
4625  static bool iova_rcache_insert(struct iova_domain *iovad,
4626                                unsigned long pfn,
4627 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4628  
4629                 /* Try replenishing IOVAs by flushing rcache. */
4630                 flushed_rcache = true;
4631 -               preempt_disable();
4632                 for_each_online_cpu(cpu)
4633                         free_cpu_cached_iovas(cpu, iovad);
4634 -               preempt_enable();
4635                 goto retry;
4636         }
4637  
4638 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4639         bool can_insert = false;
4640         unsigned long flags;
4641  
4642 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4643 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4644         spin_lock_irqsave(&cpu_rcache->lock, flags);
4645  
4646         if (!iova_magazine_full(cpu_rcache->loaded)) {
4647 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4648                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4649  
4650         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4651 -       put_cpu_ptr(rcache->cpu_rcaches);
4652  
4653         if (mag_to_free) {
4654                 iova_magazine_free_pfns(mag_to_free, iovad);
4655 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4656         bool has_pfn = false;
4657         unsigned long flags;
4658  
4659 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4660 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4661         spin_lock_irqsave(&cpu_rcache->lock, flags);
4662  
4663         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4664 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4665                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4666  
4667         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4668 -       put_cpu_ptr(rcache->cpu_rcaches);
4669  
4670         return iova_pfn;
4671  }
4672 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4673 index 3f9ddb9fafa7..09da5b6b44a1 100644
4674 --- a/drivers/leds/trigger/Kconfig
4675 +++ b/drivers/leds/trigger/Kconfig
4676 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4677  
4678  config LEDS_TRIGGER_CPU
4679         bool "LED CPU Trigger"
4680 -       depends on LEDS_TRIGGERS
4681 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4682         help
4683           This allows LEDs to be controlled by active CPUs. This shows
4684           the active CPUs across an array of LEDs so you can see which
4685 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4686 index 4d200883c505..98b64ed5cb81 100644
4687 --- a/drivers/md/bcache/Kconfig
4688 +++ b/drivers/md/bcache/Kconfig
4689 @@ -1,6 +1,7 @@
4690  
4691  config BCACHE
4692         tristate "Block device as cache"
4693 +       depends on !PREEMPT_RT_FULL
4694         ---help---
4695         Allows a block device to be used as cache for other devices; uses
4696         a btree for indexing and the layout is optimized for SSDs.
4697 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4698 index 2c965424d383..2c8877f50626 100644
4699 --- a/drivers/md/dm-rq.c
4700 +++ b/drivers/md/dm-rq.c
4701 @@ -842,7 +842,7 @@ static void dm_old_request_fn(struct request_queue *q)
4702                 /* Establish tio->ti before queuing work (map_tio_request) */
4703                 tio->ti = ti;
4704                 kthread_queue_work(&md->kworker, &tio->work);
4705 -               BUG_ON(!irqs_disabled());
4706 +               BUG_ON_NONRT(!irqs_disabled());
4707         }
4708  }
4709  
4710 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4711 index cce6057b9aca..fa2c4de32a64 100644
4712 --- a/drivers/md/raid5.c
4713 +++ b/drivers/md/raid5.c
4714 @@ -1928,8 +1928,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4715         struct raid5_percpu *percpu;
4716         unsigned long cpu;
4717  
4718 -       cpu = get_cpu();
4719 +       cpu = get_cpu_light();
4720         percpu = per_cpu_ptr(conf->percpu, cpu);
4721 +       spin_lock(&percpu->lock);
4722         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4723                 ops_run_biofill(sh);
4724                 overlap_clear++;
4725 @@ -1985,7 +1986,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4726                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4727                                 wake_up(&sh->raid_conf->wait_for_overlap);
4728                 }
4729 -       put_cpu();
4730 +       spin_unlock(&percpu->lock);
4731 +       put_cpu_light();
4732  }
4733  
4734  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4735 @@ -6391,6 +6393,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
4736                        __func__, cpu);
4737                 return -ENOMEM;
4738         }
4739 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4740         return 0;
4741  }
4742  
4743 @@ -6401,7 +6404,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4744         conf->percpu = alloc_percpu(struct raid5_percpu);
4745         if (!conf->percpu)
4746                 return -ENOMEM;
4747 -
4748         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4749         if (!err) {
4750                 conf->scribble_disks = max(conf->raid_disks,
4751 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4752 index 57ec49f0839e..0739604990b7 100644
4753 --- a/drivers/md/raid5.h
4754 +++ b/drivers/md/raid5.h
4755 @@ -504,6 +504,7 @@ struct r5conf {
4756         int                     recovery_disabled;
4757         /* per cpu variables */
4758         struct raid5_percpu {
4759 +               spinlock_t      lock;           /* Protection for -RT */
4760                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4761                 struct flex_array *scribble;   /* space for constructing buffer
4762                                               * lists and performing address
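
The raid5 hunks above combine two -rt idioms: get_cpu_light()/put_cpu_light() keep the task on its CPU without disabling preemption, and the spinlock added to struct raid5_percpu serializes the per-CPU scratch buffers that disabled preemption used to protect implicitly. A minimal sketch of that pattern on a hypothetical per-CPU scratch structure, using only the helpers the patch itself relies on:

	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	/* hypothetical per-CPU scratch area, usable with preemption enabled on -rt */
	struct example_percpu {
		spinlock_t lock;	/* protection for -RT */
		void *scratch;
	};
	static DEFINE_PER_CPU(struct example_percpu, example_area);

	static void example_run(void)
	{
		struct example_percpu *p;
		int cpu;

		cpu = get_cpu_light();	/* pins the task to this CPU, stays preemptible */
		p = per_cpu_ptr(&example_area, cpu);
		spin_lock(&p->lock);	/* sleeping lock on -rt, ordinary spinlock otherwise */

		/* ... operate on p->scratch ... */

		spin_unlock(&p->lock);
		put_cpu_light();
	}
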
4763 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4764 index 64971baf11fa..215e91e36198 100644
4765 --- a/drivers/misc/Kconfig
4766 +++ b/drivers/misc/Kconfig
4767 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4768  config ATMEL_TCLIB
4769         bool "Atmel AT32/AT91 Timer/Counter Library"
4770         depends on (AVR32 || ARCH_AT91)
4771 +       default y if PREEMPT_RT_FULL
4772         help
4773           Select this if you want a library to allocate the Timer/Counter
4774           blocks found on many Atmel processors.  This facilitates using
4775 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4776           are combined to make a single 32-bit timer.
4777  
4778           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4779 -         may be used as a clock event device supporting oneshot mode
4780 -         (delays of up to two seconds) based on the 32 KiHz clock.
4781 +         may be used as a clock event device supporting oneshot mode.
4782  
4783  config ATMEL_TCB_CLKSRC_BLOCK
4784         int
4785 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4786           TC can be used for other purposes, such as PWM generation and
4787           interval timing.
4788  
4789 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4790 +       bool "TC Block uses 32 KiHz clock"
4791 +       depends on ATMEL_TCB_CLKSRC
4792 +       default y if !PREEMPT_RT_FULL
4793 +       help
4794 +         Select this to use 32 KiHz base clock rate as TC block clock
4795 +         source for clock events.
4796 +
4797 +
4798  config DUMMY_IRQ
4799         tristate "Dummy IRQ handler"
4800         default n
4801 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
4802 index df990bb8c873..1a162709a85e 100644
4803 --- a/drivers/mmc/host/mmci.c
4804 +++ b/drivers/mmc/host/mmci.c
4805 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4806         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4807         struct variant_data *variant = host->variant;
4808         void __iomem *base = host->base;
4809 -       unsigned long flags;
4810         u32 status;
4811  
4812         status = readl(base + MMCISTATUS);
4813  
4814         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4815  
4816 -       local_irq_save(flags);
4817 -
4818         do {
4819                 unsigned int remain, len;
4820                 char *buffer;
4821 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4822  
4823         sg_miter_stop(sg_miter);
4824  
4825 -       local_irq_restore(flags);
4826 -
4827         /*
4828          * If we have less than the fifo 'half-full' threshold to transfer,
4829          * trigger a PIO interrupt as soon as any data is available.
4830 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
4831 index 9133e7926da5..63afb921ed40 100644
4832 --- a/drivers/net/ethernet/3com/3c59x.c
4833 +++ b/drivers/net/ethernet/3com/3c59x.c
4834 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
4835  {
4836         struct vortex_private *vp = netdev_priv(dev);
4837         unsigned long flags;
4838 -       local_irq_save(flags);
4839 +       local_irq_save_nort(flags);
4840         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4841 -       local_irq_restore(flags);
4842 +       local_irq_restore_nort(flags);
4843  }
4844  #endif
4845  
4846 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
4847                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4848                          */
4849                         unsigned long flags;
4850 -                       local_irq_save(flags);
4851 +                       local_irq_save_nort(flags);
4852                         if (vp->full_bus_master_tx)
4853                                 boomerang_interrupt(dev->irq, dev);
4854                         else
4855                                 vortex_interrupt(dev->irq, dev);
4856 -                       local_irq_restore(flags);
4857 +                       local_irq_restore_nort(flags);
4858                 }
4859         }
4860  
4861 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
4862 index da4c2d8a4173..1420dfb56bac 100644
4863 --- a/drivers/net/ethernet/realtek/8139too.c
4864 +++ b/drivers/net/ethernet/realtek/8139too.c
4865 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
4866         struct rtl8139_private *tp = netdev_priv(dev);
4867         const int irq = tp->pci_dev->irq;
4868  
4869 -       disable_irq(irq);
4870 +       disable_irq_nosync(irq);
4871         rtl8139_interrupt(irq, dev);
4872         enable_irq(irq);
4873  }
4874 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4875 index bca6935a94db..d7a35ee34d03 100644
4876 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4877 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4878 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
4879                         while (!ctx->done.done && msecs--)
4880                                 udelay(1000);
4881                 } else {
4882 -                       wait_event_interruptible(ctx->done.wait,
4883 +                       swait_event_interruptible(ctx->done.wait,
4884                                                  ctx->done.done);
4885                 }
4886                 break;
4887 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
4888 index d11cdbb8fba3..223bbb9acb03 100644
4889 --- a/drivers/pci/access.c
4890 +++ b/drivers/pci/access.c
4891 @@ -672,7 +672,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
4892         WARN_ON(!dev->block_cfg_access);
4893  
4894         dev->block_cfg_access = 0;
4895 -       wake_up_all(&pci_cfg_wait);
4896 +       wake_up_all_locked(&pci_cfg_wait);
4897         raw_spin_unlock_irqrestore(&pci_lock, flags);
4898  }
4899  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4900 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
4901 index bedce3453dd3..faf038978650 100644
4902 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
4903 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
4904 @@ -61,7 +61,7 @@ struct msm_pinctrl {
4905         struct notifier_block restart_nb;
4906         int irq;
4907  
4908 -       spinlock_t lock;
4909 +       raw_spinlock_t lock;
4910  
4911         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
4912         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
4913 @@ -153,14 +153,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
4914         if (WARN_ON(i == g->nfuncs))
4915                 return -EINVAL;
4916  
4917 -       spin_lock_irqsave(&pctrl->lock, flags);
4918 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4919  
4920         val = readl(pctrl->regs + g->ctl_reg);
4921         val &= ~mask;
4922         val |= i << g->mux_bit;
4923         writel(val, pctrl->regs + g->ctl_reg);
4924  
4925 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4926 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4927  
4928         return 0;
4929  }
4930 @@ -323,14 +323,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
4931                         break;
4932                 case PIN_CONFIG_OUTPUT:
4933                         /* set output value */
4934 -                       spin_lock_irqsave(&pctrl->lock, flags);
4935 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
4936                         val = readl(pctrl->regs + g->io_reg);
4937                         if (arg)
4938                                 val |= BIT(g->out_bit);
4939                         else
4940                                 val &= ~BIT(g->out_bit);
4941                         writel(val, pctrl->regs + g->io_reg);
4942 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
4943 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4944  
4945                         /* enable output */
4946                         arg = 1;
4947 @@ -351,12 +351,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
4948                         return -EINVAL;
4949                 }
4950  
4951 -               spin_lock_irqsave(&pctrl->lock, flags);
4952 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
4953                 val = readl(pctrl->regs + g->ctl_reg);
4954                 val &= ~(mask << bit);
4955                 val |= arg << bit;
4956                 writel(val, pctrl->regs + g->ctl_reg);
4957 -               spin_unlock_irqrestore(&pctrl->lock, flags);
4958 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4959         }
4960  
4961         return 0;
4962 @@ -384,13 +384,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
4963  
4964         g = &pctrl->soc->groups[offset];
4965  
4966 -       spin_lock_irqsave(&pctrl->lock, flags);
4967 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4968  
4969         val = readl(pctrl->regs + g->ctl_reg);
4970         val &= ~BIT(g->oe_bit);
4971         writel(val, pctrl->regs + g->ctl_reg);
4972  
4973 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4974 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4975  
4976         return 0;
4977  }
4978 @@ -404,7 +404,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
4979  
4980         g = &pctrl->soc->groups[offset];
4981  
4982 -       spin_lock_irqsave(&pctrl->lock, flags);
4983 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4984  
4985         val = readl(pctrl->regs + g->io_reg);
4986         if (value)
4987 @@ -417,7 +417,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
4988         val |= BIT(g->oe_bit);
4989         writel(val, pctrl->regs + g->ctl_reg);
4990  
4991 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4992 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4993  
4994         return 0;
4995  }
4996 @@ -443,7 +443,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
4997  
4998         g = &pctrl->soc->groups[offset];
4999  
5000 -       spin_lock_irqsave(&pctrl->lock, flags);
5001 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5002  
5003         val = readl(pctrl->regs + g->io_reg);
5004         if (value)
5005 @@ -452,7 +452,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
5006                 val &= ~BIT(g->out_bit);
5007         writel(val, pctrl->regs + g->io_reg);
5008  
5009 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5010 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5011  }
5012  
5013  #ifdef CONFIG_DEBUG_FS
5014 @@ -571,7 +571,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
5015  
5016         g = &pctrl->soc->groups[d->hwirq];
5017  
5018 -       spin_lock_irqsave(&pctrl->lock, flags);
5019 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5020  
5021         val = readl(pctrl->regs + g->intr_cfg_reg);
5022         val &= ~BIT(g->intr_enable_bit);
5023 @@ -579,7 +579,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
5024  
5025         clear_bit(d->hwirq, pctrl->enabled_irqs);
5026  
5027 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5028 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5029  }
5030  
5031  static void msm_gpio_irq_unmask(struct irq_data *d)
5032 @@ -592,7 +592,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
5033  
5034         g = &pctrl->soc->groups[d->hwirq];
5035  
5036 -       spin_lock_irqsave(&pctrl->lock, flags);
5037 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5038  
5039         val = readl(pctrl->regs + g->intr_cfg_reg);
5040         val |= BIT(g->intr_enable_bit);
5041 @@ -600,7 +600,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
5042  
5043         set_bit(d->hwirq, pctrl->enabled_irqs);
5044  
5045 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5046 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5047  }
5048  
5049  static void msm_gpio_irq_ack(struct irq_data *d)
5050 @@ -613,7 +613,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
5051  
5052         g = &pctrl->soc->groups[d->hwirq];
5053  
5054 -       spin_lock_irqsave(&pctrl->lock, flags);
5055 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5056  
5057         val = readl(pctrl->regs + g->intr_status_reg);
5058         if (g->intr_ack_high)
5059 @@ -625,7 +625,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
5060         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
5061                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
5062  
5063 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5064 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5065  }
5066  
5067  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5068 @@ -638,7 +638,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5069  
5070         g = &pctrl->soc->groups[d->hwirq];
5071  
5072 -       spin_lock_irqsave(&pctrl->lock, flags);
5073 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5074  
5075         /*
5076          * For hw without possibility of detecting both edges
5077 @@ -712,7 +712,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
5078         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
5079                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
5080  
5081 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5082 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5083  
5084         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
5085                 irq_set_handler_locked(d, handle_level_irq);
5086 @@ -728,11 +728,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
5087         struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
5088         unsigned long flags;
5089  
5090 -       spin_lock_irqsave(&pctrl->lock, flags);
5091 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5092  
5093         irq_set_irq_wake(pctrl->irq, on);
5094  
5095 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5096 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5097  
5098         return 0;
5099  }
5100 @@ -878,7 +878,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
5101         pctrl->soc = soc_data;
5102         pctrl->chip = msm_gpio_template;
5103  
5104 -       spin_lock_init(&pctrl->lock);
5105 +       raw_spin_lock_init(&pctrl->lock);
5106  
5107         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
5108         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
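
The pinctrl-msm conversion above switches the driver to raw_spinlock_t because these register updates run from irq-chip callbacks, and on PREEMPT_RT an ordinary spinlock_t becomes a sleeping lock that cannot be taken in that context. A minimal sketch of the raw-lock read-modify-write pattern; the structure and parameters are hypothetical:

	#include <linux/types.h>
	#include <linux/spinlock.h>
	#include <linux/io.h>

	struct example_gpio {
		raw_spinlock_t lock;	/* must remain a true spinning lock on -rt */
		void __iomem *regs;
	};

	static void example_gpio_rmw(struct example_gpio *gc, u32 reg, u32 bit, bool set)
	{
		unsigned long flags;
		u32 val;

		raw_spin_lock_irqsave(&gc->lock, flags);
		val = readl(gc->regs + reg);
		if (set)
			val |= bit;
		else
			val &= ~bit;
		writel(val, gc->regs + reg);
		raw_spin_unlock_irqrestore(&gc->lock, flags);
	}
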
5109 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
5110 index 9bd41a35a78a..8e2d436c2e3f 100644
5111 --- a/drivers/scsi/fcoe/fcoe.c
5112 +++ b/drivers/scsi/fcoe/fcoe.c
5113 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
5114  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
5115  {
5116         struct fcoe_percpu_s *fps;
5117 -       int rc;
5118 +       int rc, cpu = get_cpu_light();
5119  
5120 -       fps = &get_cpu_var(fcoe_percpu);
5121 +       fps = &per_cpu(fcoe_percpu, cpu);
5122         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
5123 -       put_cpu_var(fcoe_percpu);
5124 +       put_cpu_light();
5125  
5126         return rc;
5127  }
5128 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
5129                 return 0;
5130         }
5131  
5132 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5133 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5134         stats->InvalidCRCCount++;
5135         if (stats->InvalidCRCCount < 5)
5136                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
5137 -       put_cpu();
5138 +       put_cpu_light();
5139         return -EINVAL;
5140  }
5141  
5142 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5143          */
5144         hp = (struct fcoe_hdr *) skb_network_header(skb);
5145  
5146 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5147 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5148         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
5149                 if (stats->ErrorFrames < 5)
5150                         printk(KERN_WARNING "fcoe: FCoE version "
5151 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5152                 goto drop;
5153  
5154         if (!fcoe_filter_frames(lport, fp)) {
5155 -               put_cpu();
5156 +               put_cpu_light();
5157                 fc_exch_recv(lport, fp);
5158                 return;
5159         }
5160  drop:
5161         stats->ErrorFrames++;
5162 -       put_cpu();
5163 +       put_cpu_light();
5164         kfree_skb(skb);
5165  }
5166  
5167 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
5168 index dcf36537a767..1a1f2e46452c 100644
5169 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
5170 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
5171 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5172  
5173         INIT_LIST_HEAD(&del_list);
5174  
5175 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
5176 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
5177  
5178         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
5179                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
5180 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5181                                 sel_time = fcf->time;
5182                 }
5183         }
5184 -       put_cpu();
5185 +       put_cpu_light();
5186  
5187         list_for_each_entry_safe(fcf, next, &del_list, list) {
5188                 /* Removes fcf from current list */
5189 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
5190 index 16ca31ad5ec0..c3987347e762 100644
5191 --- a/drivers/scsi/libfc/fc_exch.c
5192 +++ b/drivers/scsi/libfc/fc_exch.c
5193 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
5194         }
5195         memset(ep, 0, sizeof(*ep));
5196  
5197 -       cpu = get_cpu();
5198 +       cpu = get_cpu_light();
5199         pool = per_cpu_ptr(mp->pool, cpu);
5200         spin_lock_bh(&pool->lock);
5201 -       put_cpu();
5202 +       put_cpu_light();
5203  
5204         /* peek cache of free slot */
5205         if (pool->left != FC_XID_UNKNOWN) {
5206 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
5207 index 87f5e694dbed..23c0a50fb6aa 100644
5208 --- a/drivers/scsi/libsas/sas_ata.c
5209 +++ b/drivers/scsi/libsas/sas_ata.c
5210 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5211         /* TODO: audit callers to ensure they are ready for qc_issue to
5212          * unconditionally re-enable interrupts
5213          */
5214 -       local_irq_save(flags);
5215 +       local_irq_save_nort(flags);
5216         spin_unlock(ap->lock);
5217  
5218         /* If the device fell off, no sense in issuing commands */
5219 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5220  
5221   out:
5222         spin_lock(ap->lock);
5223 -       local_irq_restore(flags);
5224 +       local_irq_restore_nort(flags);
5225         return ret;
5226  }
5227  
5228 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
5229 index edc48f3b8230..ee5c6f9dfb6f 100644
5230 --- a/drivers/scsi/qla2xxx/qla_inline.h
5231 +++ b/drivers/scsi/qla2xxx/qla_inline.h
5232 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
5233  {
5234         unsigned long flags;
5235         struct qla_hw_data *ha = rsp->hw;
5236 -       local_irq_save(flags);
5237 +       local_irq_save_nort(flags);
5238         if (IS_P3P_TYPE(ha))
5239                 qla82xx_poll(0, rsp);
5240         else
5241                 ha->isp_ops->intr_handler(0, rsp);
5242 -       local_irq_restore(flags);
5243 +       local_irq_restore_nort(flags);
5244  }
5245  
5246  static inline uint8_t *
5247 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
5248 index 068c4e47fac9..a2090f640397 100644
5249 --- a/drivers/scsi/qla2xxx/qla_isr.c
5250 +++ b/drivers/scsi/qla2xxx/qla_isr.c
5251 @@ -3125,7 +3125,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
5252                 * kref_put().
5253                 */
5254                 kref_get(&qentry->irq_notify.kref);
5255 +#ifdef CONFIG_PREEMPT_RT_BASE
5256 +               swork_queue(&qentry->irq_notify.swork);
5257 +#else
5258                 schedule_work(&qentry->irq_notify.work);
5259 +#endif
5260         }
5261  
5262         /*
5263 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
5264 index 95f4c1bcdb4c..0be934799bff 100644
5265 --- a/drivers/thermal/x86_pkg_temp_thermal.c
5266 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
5267 @@ -29,6 +29,7 @@
5268  #include <linux/pm.h>
5269  #include <linux/thermal.h>
5270  #include <linux/debugfs.h>
5271 +#include <linux/swork.h>
5272  #include <asm/cpu_device_id.h>
5273  #include <asm/mce.h>
5274  
5275 @@ -353,7 +354,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
5276         }
5277  }
5278  
5279 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5280 +static void platform_thermal_notify_work(struct swork_event *event)
5281  {
5282         unsigned long flags;
5283         int cpu = smp_processor_id();
5284 @@ -370,7 +371,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5285                         pkg_work_scheduled[phy_id]) {
5286                 disable_pkg_thres_interrupt();
5287                 spin_unlock_irqrestore(&pkg_work_lock, flags);
5288 -               return -EINVAL;
5289 +               return;
5290         }
5291         pkg_work_scheduled[phy_id] = 1;
5292         spin_unlock_irqrestore(&pkg_work_lock, flags);
5293 @@ -379,9 +380,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5294         schedule_delayed_work_on(cpu,
5295                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
5296                                 msecs_to_jiffies(notify_delay_ms));
5297 +}
5298 +
5299 +#ifdef CONFIG_PREEMPT_RT_FULL
5300 +static struct swork_event notify_work;
5301 +
5302 +static int thermal_notify_work_init(void)
5303 +{
5304 +       int err;
5305 +
5306 +       err = swork_get();
5307 +       if (err)
5308 +               return err;
5309 +
5310 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
5311         return 0;
5312  }
5313  
5314 +static void thermal_notify_work_cleanup(void)
5315 +{
5316 +       swork_put();
5317 +}
5318 +
5319 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5320 +{
5321 +       swork_queue(&notify_work);
5322 +       return 0;
5323 +}
5324 +
5325 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5326 +
5327 +static int thermal_notify_work_init(void) { return 0; }
5328 +
5329 +static void thermal_notify_work_cleanup(void) {  }
5330 +
5331 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5332 +{
5333 +       platform_thermal_notify_work(NULL);
5334 +
5335 +       return 0;
5336 +}
5337 +#endif /* CONFIG_PREEMPT_RT_FULL */
5338 +
5339  static int find_siblings_cpu(int cpu)
5340  {
5341         int i;
5342 @@ -585,6 +625,9 @@ static int __init pkg_temp_thermal_init(void)
5343         if (!x86_match_cpu(pkg_temp_thermal_ids))
5344                 return -ENODEV;
5345  
5346 +       if (thermal_notify_work_init())
5347 +               return -ENODEV;
5348 +
5349         spin_lock_init(&pkg_work_lock);
5350         platform_thermal_package_notify =
5351                         pkg_temp_thermal_platform_thermal_notify;
5352 @@ -609,7 +652,7 @@ static int __init pkg_temp_thermal_init(void)
5353         kfree(pkg_work_scheduled);
5354         platform_thermal_package_notify = NULL;
5355         platform_thermal_package_rate_control = NULL;
5356 -
5357 +       thermal_notify_work_cleanup();
5358         return -ENODEV;
5359  }
5360  
5361 @@ -634,6 +677,7 @@ static void __exit pkg_temp_thermal_exit(void)
5362         mutex_unlock(&phy_dev_list_mutex);
5363         platform_thermal_package_notify = NULL;
5364         platform_thermal_package_rate_control = NULL;
5365 +       thermal_notify_work_cleanup();
5366         for_each_online_cpu(i)
5367                 cancel_delayed_work_sync(
5368                         &per_cpu(pkg_temp_thermal_threshold_work, i));
5369 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
5370 index e8819aa20415..dd7f9bf45d6c 100644
5371 --- a/drivers/tty/serial/8250/8250_core.c
5372 +++ b/drivers/tty/serial/8250/8250_core.c
5373 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
5374  
5375  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5376  
5377 -#define PASS_LIMIT     512
5378 +/*
5379 + * On -rt we can have a more delays, and legitimately
5380 + * so - so don't drop work spuriously and spam the
5381 + * syslog:
5382 + */
5383 +#ifdef CONFIG_PREEMPT_RT_FULL
5384 +# define PASS_LIMIT    1000000
5385 +#else
5386 +# define PASS_LIMIT    512
5387 +#endif
5388  
5389  #include <asm/serial.h>
5390  /*
5391 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
5392 index 080d5a59d0a7..eecc4f111473 100644
5393 --- a/drivers/tty/serial/8250/8250_port.c
5394 +++ b/drivers/tty/serial/8250/8250_port.c
5395 @@ -35,6 +35,7 @@
5396  #include <linux/nmi.h>
5397  #include <linux/mutex.h>
5398  #include <linux/slab.h>
5399 +#include <linux/kdb.h>
5400  #include <linux/uaccess.h>
5401  #include <linux/pm_runtime.h>
5402  #include <linux/timer.h>
5403 @@ -3144,9 +3145,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
5404  
5405         serial8250_rpm_get(up);
5406  
5407 -       if (port->sysrq)
5408 +       if (port->sysrq || oops_in_progress)
5409                 locked = 0;
5410 -       else if (oops_in_progress)
5411 +       else if (in_kdb_printk())
5412                 locked = spin_trylock_irqsave(&port->lock, flags);
5413         else
5414                 spin_lock_irqsave(&port->lock, flags);
5415 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
5416 index e2c33b9528d8..53af53c43e8c 100644
5417 --- a/drivers/tty/serial/amba-pl011.c
5418 +++ b/drivers/tty/serial/amba-pl011.c
5419 @@ -2194,13 +2194,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5420  
5421         clk_enable(uap->clk);
5422  
5423 -       local_irq_save(flags);
5424 +       /*
5425 +        * local_irq_save(flags);
5426 +        *
5427 +        * This local_irq_save() is nonsense. If we come in via sysrq
5428 +        * handling then interrupts are already disabled. Aside from
5429 +        * that, the port.sysrq check is racy on SMP regardless.
5430 +       */
5431         if (uap->port.sysrq)
5432                 locked = 0;
5433         else if (oops_in_progress)
5434 -               locked = spin_trylock(&uap->port.lock);
5435 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5436         else
5437 -               spin_lock(&uap->port.lock);
5438 +               spin_lock_irqsave(&uap->port.lock, flags);
5439  
5440         /*
5441          *      First save the CR then disable the interrupts
5442 @@ -2224,8 +2230,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5443                 pl011_write(old_cr, uap, REG_CR);
5444  
5445         if (locked)
5446 -               spin_unlock(&uap->port.lock);
5447 -       local_irq_restore(flags);
5448 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5449  
5450         clk_disable(uap->clk);
5451  }
5452 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
5453 index a2a529994ba5..0ee7c4c518df 100644
5454 --- a/drivers/tty/serial/omap-serial.c
5455 +++ b/drivers/tty/serial/omap-serial.c
5456 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
5457  
5458         pm_runtime_get_sync(up->dev);
5459  
5460 -       local_irq_save(flags);
5461 -       if (up->port.sysrq)
5462 -               locked = 0;
5463 -       else if (oops_in_progress)
5464 -               locked = spin_trylock(&up->port.lock);
5465 +       if (up->port.sysrq || oops_in_progress)
5466 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5467         else
5468 -               spin_lock(&up->port.lock);
5469 +               spin_lock_irqsave(&up->port.lock, flags);
5470  
5471         /*
5472          * First save the IER then disable the interrupts
5473 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
5474         pm_runtime_mark_last_busy(up->dev);
5475         pm_runtime_put_autosuspend(up->dev);
5476         if (locked)
5477 -               spin_unlock(&up->port.lock);
5478 -       local_irq_restore(flags);
5479 +               spin_unlock_irqrestore(&up->port.lock, flags);
5480  }
5481  
5482  static int __init
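
The pl011 and omap-serial hunks (and, with the kdb twist, the 8250 one) converge on the same console_write locking shape: never wrap a possibly-contended trylock in a bare local_irq_save(), take the port lock with irqsave instead, and only trylock on paths that may already hold it. A condensed sketch of that resulting pattern, mirroring the omap hunk above (illustrative only, not a literal copy of any driver):

/*
 * Common RT console_write locking shape after the changes above.
 */
static void console_write_sketch(struct uart_port *port, const char *s,
				 unsigned int count)
{
	unsigned long flags;
	int locked = 1;

	if (port->sysrq || oops_in_progress)
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);

	/* ... mask UART interrupts, emit the characters, restore ... */

	if (locked)
		spin_unlock_irqrestore(&port->lock, flags);
}
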
5483 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
5484 index f029aad67183..87c026876640 100644
5485 --- a/drivers/usb/core/hcd.c
5486 +++ b/drivers/usb/core/hcd.c
5487 @@ -1764,9 +1764,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
5488          * and no one may trigger the above deadlock situation when
5489          * running complete() in tasklet.
5490          */
5491 -       local_irq_save(flags);
5492 +       local_irq_save_nort(flags);
5493         urb->complete(urb);
5494 -       local_irq_restore(flags);
5495 +       local_irq_restore_nort(flags);
5496  
5497         usb_anchor_resume_wakeups(anchor);
5498         atomic_dec(&urb->use_count);
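
local_irq_save_nort()/local_irq_restore_nort() are helpers introduced elsewhere in this patch. The rough shape is sketched below as an assumption (verify against the linux/irqflags.h hunk for the authoritative spelling): on mainline they are the ordinary IRQ-off primitives, on RT_FULL they only snapshot the flags so the section stays preemptible and the URB completion callback may take sleeping locks.

/* Assumed shape of the _nort helpers (see the irqflags.h hunk of this
 * patch for the real definition).
 */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	local_save_flags(flags)
# define local_irq_restore_nort(flags)	(void)(flags)
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
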
5499 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
5500 index 89081b834615..90b231b7ad0a 100644
5501 --- a/drivers/usb/gadget/function/f_fs.c
5502 +++ b/drivers/usb/gadget/function/f_fs.c
5503 @@ -1593,7 +1593,7 @@ static void ffs_data_put(struct ffs_data *ffs)
5504                 pr_info("%s(): freeing\n", __func__);
5505                 ffs_data_clear(ffs);
5506                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5507 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5508 +                      swait_active(&ffs->ep0req_completion.wait));
5509                 kfree(ffs->dev_name);
5510                 kfree(ffs);
5511         }
5512 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
5513 index 1468d8f085a3..6aae3ae25c18 100644
5514 --- a/drivers/usb/gadget/legacy/inode.c
5515 +++ b/drivers/usb/gadget/legacy/inode.c
5516 @@ -346,7 +346,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5517         spin_unlock_irq (&epdata->dev->lock);
5518  
5519         if (likely (value == 0)) {
5520 -               value = wait_event_interruptible (done.wait, done.done);
5521 +               value = swait_event_interruptible (done.wait, done.done);
5522                 if (value != 0) {
5523                         spin_lock_irq (&epdata->dev->lock);
5524                         if (likely (epdata->ep != NULL)) {
5525 @@ -355,7 +355,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5526                                 usb_ep_dequeue (epdata->ep, epdata->req);
5527                                 spin_unlock_irq (&epdata->dev->lock);
5528  
5529 -                               wait_event (done.wait, done.done);
5530 +                               swait_event (done.wait, done.done);
5531                                 if (epdata->status == -ECONNRESET)
5532                                         epdata->status = -EINTR;
5533                         } else {
5534 diff --git a/fs/aio.c b/fs/aio.c
5535 index 428484f2f841..2b02e2eb2158 100644
5536 --- a/fs/aio.c
5537 +++ b/fs/aio.c
5538 @@ -40,6 +40,7 @@
5539  #include <linux/ramfs.h>
5540  #include <linux/percpu-refcount.h>
5541  #include <linux/mount.h>
5542 +#include <linux/swork.h>
5543  
5544  #include <asm/kmap_types.h>
5545  #include <asm/uaccess.h>
5546 @@ -115,7 +116,7 @@ struct kioctx {
5547         struct page             **ring_pages;
5548         long                    nr_pages;
5549  
5550 -       struct work_struct      free_work;
5551 +       struct swork_event      free_work;
5552  
5553         /*
5554          * signals when all in-flight requests are done
5555 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
5556                 .mount          = aio_mount,
5557                 .kill_sb        = kill_anon_super,
5558         };
5559 +       BUG_ON(swork_get());
5560         aio_mnt = kern_mount(&aio_fs);
5561         if (IS_ERR(aio_mnt))
5562                 panic("Failed to create aio fs mount.");
5563 @@ -581,9 +583,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
5564         return cancel(&kiocb->common);
5565  }
5566  
5567 -static void free_ioctx(struct work_struct *work)
5568 +static void free_ioctx(struct swork_event *sev)
5569  {
5570 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5571 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5572  
5573         pr_debug("freeing %p\n", ctx);
5574  
5575 @@ -602,8 +604,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5576         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5577                 complete(&ctx->rq_wait->comp);
5578  
5579 -       INIT_WORK(&ctx->free_work, free_ioctx);
5580 -       schedule_work(&ctx->free_work);
5581 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5582 +       swork_queue(&ctx->free_work);
5583  }
5584  
5585  /*
5586 @@ -611,9 +613,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5587   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5588   * now it's safe to cancel any that need to be.
5589   */
5590 -static void free_ioctx_users(struct percpu_ref *ref)
5591 +static void free_ioctx_users_work(struct swork_event *sev)
5592  {
5593 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5594 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5595         struct aio_kiocb *req;
5596  
5597         spin_lock_irq(&ctx->ctx_lock);
5598 @@ -632,6 +634,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
5599         percpu_ref_put(&ctx->reqs);
5600  }
5601  
5602 +static void free_ioctx_users(struct percpu_ref *ref)
5603 +{
5604 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5605 +
5606 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5607 +       swork_queue(&ctx->free_work);
5608 +}
5609 +
5610  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5611  {
5612         unsigned i, new_nr;
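
The aio conversion above is the standard swork ("simple work") pattern this patch introduces: swork_get() must be called once from process context to bring up the worker thread, and work deferred from atomic or percpu_ref release context is queued with INIT_SWORK()/swork_queue(). A minimal sketch of that pattern with hypothetical names (my_ctx, my_ctx_free, my_ctx_release), using only the swork API shown here:

#include <linux/swork.h>

struct my_ctx {
	struct swork_event	free_work;
	/* ... payload ... */
};

static void my_ctx_free(struct swork_event *sev)
{
	struct my_ctx *ctx = container_of(sev, struct my_ctx, free_work);

	kfree(ctx);		/* runs in the swork thread, may sleep */
}

static int __init my_init(void)
{
	return swork_get();	/* create the swork thread once */
}

static void my_ctx_release(struct my_ctx *ctx)
{
	/* may be called from atomic context; defer the free */
	INIT_SWORK(&ctx->free_work, my_ctx_free);
	swork_queue(&ctx->free_work);
}
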
5613 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
5614 index a1fba4285277..3796769b4cd1 100644
5615 --- a/fs/autofs4/autofs_i.h
5616 +++ b/fs/autofs4/autofs_i.h
5617 @@ -31,6 +31,7 @@
5618  #include <linux/sched.h>
5619  #include <linux/mount.h>
5620  #include <linux/namei.h>
5621 +#include <linux/delay.h>
5622  #include <asm/current.h>
5623  #include <linux/uaccess.h>
5624  
5625 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
5626 index d8e6d421c27f..2e689ab1306b 100644
5627 --- a/fs/autofs4/expire.c
5628 +++ b/fs/autofs4/expire.c
5629 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
5630                         parent = p->d_parent;
5631                         if (!spin_trylock(&parent->d_lock)) {
5632                                 spin_unlock(&p->d_lock);
5633 -                               cpu_relax();
5634 +                               cpu_chill();
5635                                 goto relock;
5636                         }
5637                         spin_unlock(&p->d_lock);
5638 diff --git a/fs/buffer.c b/fs/buffer.c
5639 index b205a629001d..5646afc022ba 100644
5640 --- a/fs/buffer.c
5641 +++ b/fs/buffer.c
5642 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5643          * decide that the page is now completely done.
5644          */
5645         first = page_buffers(page);
5646 -       local_irq_save(flags);
5647 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5648 +       flags = bh_uptodate_lock_irqsave(first);
5649         clear_buffer_async_read(bh);
5650         unlock_buffer(bh);
5651         tmp = bh;
5652 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5653                 }
5654                 tmp = tmp->b_this_page;
5655         } while (tmp != bh);
5656 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5657 -       local_irq_restore(flags);
5658 +       bh_uptodate_unlock_irqrestore(first, flags);
5659  
5660         /*
5661          * If none of the buffers had errors and they are all
5662 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5663         return;
5664  
5665  still_busy:
5666 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5667 -       local_irq_restore(flags);
5668 -       return;
5669 +       bh_uptodate_unlock_irqrestore(first, flags);
5670  }
5671  
5672  /*
5673 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5674         }
5675  
5676         first = page_buffers(page);
5677 -       local_irq_save(flags);
5678 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5679 +       flags = bh_uptodate_lock_irqsave(first);
5680  
5681         clear_buffer_async_write(bh);
5682         unlock_buffer(bh);
5683 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5684                 }
5685                 tmp = tmp->b_this_page;
5686         }
5687 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5688 -       local_irq_restore(flags);
5689 +       bh_uptodate_unlock_irqrestore(first, flags);
5690         end_page_writeback(page);
5691         return;
5692  
5693  still_busy:
5694 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5695 -       local_irq_restore(flags);
5696 -       return;
5697 +       bh_uptodate_unlock_irqrestore(first, flags);
5698  }
5699  EXPORT_SYMBOL(end_buffer_async_write);
5700  
5701 @@ -3383,6 +3375,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
5702         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5703         if (ret) {
5704                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5705 +               buffer_head_init_locks(ret);
5706                 preempt_disable();
5707                 __this_cpu_inc(bh_accounting.nr);
5708                 recalc_bh_state();
5709 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
5710 index a27fc8791551..791aecb7c1ac 100644
5711 --- a/fs/cifs/readdir.c
5712 +++ b/fs/cifs/readdir.c
5713 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
5714         struct inode *inode;
5715         struct super_block *sb = parent->d_sb;
5716         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5717 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5718 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5719  
5720         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5721  
5722 diff --git a/fs/dcache.c b/fs/dcache.c
5723 index 4485a48f4091..691039a6a872 100644
5724 --- a/fs/dcache.c
5725 +++ b/fs/dcache.c
5726 @@ -19,6 +19,7 @@
5727  #include <linux/mm.h>
5728  #include <linux/fs.h>
5729  #include <linux/fsnotify.h>
5730 +#include <linux/delay.h>
5731  #include <linux/slab.h>
5732  #include <linux/init.h>
5733  #include <linux/hash.h>
5734 @@ -750,6 +751,8 @@ static inline bool fast_dput(struct dentry *dentry)
5735   */
5736  void dput(struct dentry *dentry)
5737  {
5738 +       struct dentry *parent;
5739 +
5740         if (unlikely(!dentry))
5741                 return;
5742  
5743 @@ -788,9 +791,18 @@ void dput(struct dentry *dentry)
5744         return;
5745  
5746  kill_it:
5747 -       dentry = dentry_kill(dentry);
5748 -       if (dentry) {
5749 -               cond_resched();
5750 +       parent = dentry_kill(dentry);
5751 +       if (parent) {
5752 +               int r;
5753 +
5754 +               if (parent == dentry) {
5755 +                       /* the task with the highest priority won't schedule */
5756 +                       r = cond_resched();
5757 +                       if (!r)
5758 +                               cpu_chill();
5759 +               } else {
5760 +                       dentry = parent;
5761 +               }
5762                 goto repeat;
5763         }
5764  }
5765 @@ -2324,7 +2336,7 @@ void d_delete(struct dentry * dentry)
5766         if (dentry->d_lockref.count == 1) {
5767                 if (!spin_trylock(&inode->i_lock)) {
5768                         spin_unlock(&dentry->d_lock);
5769 -                       cpu_relax();
5770 +                       cpu_chill();
5771                         goto again;
5772                 }
5773                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5774 @@ -2384,21 +2396,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
5775  
5776  static void d_wait_lookup(struct dentry *dentry)
5777  {
5778 -       if (d_in_lookup(dentry)) {
5779 -               DECLARE_WAITQUEUE(wait, current);
5780 -               add_wait_queue(dentry->d_wait, &wait);
5781 -               do {
5782 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5783 -                       spin_unlock(&dentry->d_lock);
5784 -                       schedule();
5785 -                       spin_lock(&dentry->d_lock);
5786 -               } while (d_in_lookup(dentry));
5787 -       }
5788 +       struct swait_queue __wait;
5789 +
5790 +       if (!d_in_lookup(dentry))
5791 +               return;
5792 +
5793 +       INIT_LIST_HEAD(&__wait.task_list);
5794 +       do {
5795 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5796 +               spin_unlock(&dentry->d_lock);
5797 +               schedule();
5798 +               spin_lock(&dentry->d_lock);
5799 +       } while (d_in_lookup(dentry));
5800 +       finish_swait(dentry->d_wait, &__wait);
5801  }
5802  
5803  struct dentry *d_alloc_parallel(struct dentry *parent,
5804                                 const struct qstr *name,
5805 -                               wait_queue_head_t *wq)
5806 +                               struct swait_queue_head *wq)
5807  {
5808         unsigned int hash = name->hash;
5809         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5810 @@ -2507,7 +2522,7 @@ void __d_lookup_done(struct dentry *dentry)
5811         hlist_bl_lock(b);
5812         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5813         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5814 -       wake_up_all(dentry->d_wait);
5815 +       swake_up_all(dentry->d_wait);
5816         dentry->d_wait = NULL;
5817         hlist_bl_unlock(b);
5818         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5819 @@ -3604,6 +3619,11 @@ EXPORT_SYMBOL(d_genocide);
5820  
5821  void __init vfs_caches_init_early(void)
5822  {
5823 +       int i;
5824 +
5825 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5826 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5827 +
5828         dcache_init_early();
5829         inode_init_early();
5830  }
5831 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
5832 index 10db91218933..42af0a06f657 100644
5833 --- a/fs/eventpoll.c
5834 +++ b/fs/eventpoll.c
5835 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
5836   */
5837  static void ep_poll_safewake(wait_queue_head_t *wq)
5838  {
5839 -       int this_cpu = get_cpu();
5840 +       int this_cpu = get_cpu_light();
5841  
5842         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5843                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5844  
5845 -       put_cpu();
5846 +       put_cpu_light();
5847  }
5848  
5849  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
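
get_cpu_light()/put_cpu_light() also come from this patch; instead of disabling preemption they pin the task to its CPU via migrate_disable(), which is what makes the nested-call bookkeeping above safe on RT without turning it into a non-preemptible section. Assumed shape (the authoritative definition is in the linux/smp.h hunk):

/* Assumed *_light helpers: stay on this CPU, remain preemptible. */
#define get_cpu_light()		({ migrate_disable(); smp_processor_id(); })
#define put_cpu_light()		migrate_enable()
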
5850 diff --git a/fs/exec.c b/fs/exec.c
5851 index 67e86571685a..fe14cdd84016 100644
5852 --- a/fs/exec.c
5853 +++ b/fs/exec.c
5854 @@ -1017,12 +1017,14 @@ static int exec_mmap(struct mm_struct *mm)
5855                 }
5856         }
5857         task_lock(tsk);
5858 +       preempt_disable_rt();
5859         active_mm = tsk->active_mm;
5860         tsk->mm = mm;
5861         tsk->active_mm = mm;
5862         activate_mm(active_mm, mm);
5863         tsk->mm->vmacache_seqnum = 0;
5864         vmacache_flush(tsk);
5865 +       preempt_enable_rt();
5866         task_unlock(tsk);
5867         if (old_mm) {
5868                 up_read(&old_mm->mmap_sem);
5869 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
5870 index 642c57b8de7b..8494b9308333 100644
5871 --- a/fs/fuse/dir.c
5872 +++ b/fs/fuse/dir.c
5873 @@ -1191,7 +1191,7 @@ static int fuse_direntplus_link(struct file *file,
5874         struct inode *dir = d_inode(parent);
5875         struct fuse_conn *fc;
5876         struct inode *inode;
5877 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5878 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5879  
5880         if (!o->nodeid) {
5881                 /*
5882 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
5883 index 684996c8a3a4..6e18a06aaabe 100644
5884 --- a/fs/jbd2/checkpoint.c
5885 +++ b/fs/jbd2/checkpoint.c
5886 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
5887         nblocks = jbd2_space_needed(journal);
5888         while (jbd2_log_space_left(journal) < nblocks) {
5889                 write_unlock(&journal->j_state_lock);
5890 +               if (current->plug)
5891 +                       io_schedule();
5892                 mutex_lock(&journal->j_checkpoint_mutex);
5893  
5894                 /*
5895 diff --git a/fs/locks.c b/fs/locks.c
5896 index 22c5b4aa4961..269c6a44449a 100644
5897 --- a/fs/locks.c
5898 +++ b/fs/locks.c
5899 @@ -935,7 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5900                         return -ENOMEM;
5901         }
5902  
5903 -       percpu_down_read_preempt_disable(&file_rwsem);
5904 +       percpu_down_read(&file_rwsem);
5905         spin_lock(&ctx->flc_lock);
5906         if (request->fl_flags & FL_ACCESS)
5907                 goto find_conflict;
5908 @@ -976,7 +976,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5909  
5910  out:
5911         spin_unlock(&ctx->flc_lock);
5912 -       percpu_up_read_preempt_enable(&file_rwsem);
5913 +       percpu_up_read(&file_rwsem);
5914         if (new_fl)
5915                 locks_free_lock(new_fl);
5916         locks_dispose_list(&dispose);
5917 @@ -1013,7 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5918                 new_fl2 = locks_alloc_lock();
5919         }
5920  
5921 -       percpu_down_read_preempt_disable(&file_rwsem);
5922 +       percpu_down_read(&file_rwsem);
5923         spin_lock(&ctx->flc_lock);
5924         /*
5925          * New lock request. Walk all POSIX locks and look for conflicts. If
5926 @@ -1185,7 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5927         }
5928   out:
5929         spin_unlock(&ctx->flc_lock);
5930 -       percpu_up_read_preempt_enable(&file_rwsem);
5931 +       percpu_up_read(&file_rwsem);
5932         /*
5933          * Free any unused locks.
5934          */
5935 @@ -1460,7 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5936                 return error;
5937         }
5938  
5939 -       percpu_down_read_preempt_disable(&file_rwsem);
5940 +       percpu_down_read(&file_rwsem);
5941         spin_lock(&ctx->flc_lock);
5942  
5943         time_out_leases(inode, &dispose);
5944 @@ -1512,13 +1512,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5945         locks_insert_block(fl, new_fl);
5946         trace_break_lease_block(inode, new_fl);
5947         spin_unlock(&ctx->flc_lock);
5948 -       percpu_up_read_preempt_enable(&file_rwsem);
5949 +       percpu_up_read(&file_rwsem);
5950  
5951         locks_dispose_list(&dispose);
5952         error = wait_event_interruptible_timeout(new_fl->fl_wait,
5953                                                 !new_fl->fl_next, break_time);
5954  
5955 -       percpu_down_read_preempt_disable(&file_rwsem);
5956 +       percpu_down_read(&file_rwsem);
5957         spin_lock(&ctx->flc_lock);
5958         trace_break_lease_unblock(inode, new_fl);
5959         locks_delete_block(new_fl);
5960 @@ -1535,7 +1535,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5961         }
5962  out:
5963         spin_unlock(&ctx->flc_lock);
5964 -       percpu_up_read_preempt_enable(&file_rwsem);
5965 +       percpu_up_read(&file_rwsem);
5966         locks_dispose_list(&dispose);
5967         locks_free_lock(new_fl);
5968         return error;
5969 @@ -1609,7 +1609,7 @@ int fcntl_getlease(struct file *filp)
5970  
5971         ctx = smp_load_acquire(&inode->i_flctx);
5972         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5973 -               percpu_down_read_preempt_disable(&file_rwsem);
5974 +               percpu_down_read(&file_rwsem);
5975                 spin_lock(&ctx->flc_lock);
5976                 time_out_leases(inode, &dispose);
5977                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5978 @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
5979                         break;
5980                 }
5981                 spin_unlock(&ctx->flc_lock);
5982 -               percpu_up_read_preempt_enable(&file_rwsem);
5983 +               percpu_up_read(&file_rwsem);
5984  
5985                 locks_dispose_list(&dispose);
5986         }
5987 @@ -1694,7 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5988                 return -EINVAL;
5989         }
5990  
5991 -       percpu_down_read_preempt_disable(&file_rwsem);
5992 +       percpu_down_read(&file_rwsem);
5993         spin_lock(&ctx->flc_lock);
5994         time_out_leases(inode, &dispose);
5995         error = check_conflicting_open(dentry, arg, lease->fl_flags);
5996 @@ -1765,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5997                 lease->fl_lmops->lm_setup(lease, priv);
5998  out:
5999         spin_unlock(&ctx->flc_lock);
6000 -       percpu_up_read_preempt_enable(&file_rwsem);
6001 +       percpu_up_read(&file_rwsem);
6002         locks_dispose_list(&dispose);
6003         if (is_deleg)
6004                 inode_unlock(inode);
6005 @@ -1788,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
6006                 return error;
6007         }
6008  
6009 -       percpu_down_read_preempt_disable(&file_rwsem);
6010 +       percpu_down_read(&file_rwsem);
6011         spin_lock(&ctx->flc_lock);
6012         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
6013                 if (fl->fl_file == filp &&
6014 @@ -1801,7 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
6015         if (victim)
6016                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
6017         spin_unlock(&ctx->flc_lock);
6018 -       percpu_up_read_preempt_enable(&file_rwsem);
6019 +       percpu_up_read(&file_rwsem);
6020         locks_dispose_list(&dispose);
6021         return error;
6022  }
6023 @@ -2532,13 +2532,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
6024         if (list_empty(&ctx->flc_lease))
6025                 return;
6026  
6027 -       percpu_down_read_preempt_disable(&file_rwsem);
6028 +       percpu_down_read(&file_rwsem);
6029         spin_lock(&ctx->flc_lock);
6030         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
6031                 if (filp == fl->fl_file)
6032                         lease_modify(fl, F_UNLCK, &dispose);
6033         spin_unlock(&ctx->flc_lock);
6034 -       percpu_up_read_preempt_enable(&file_rwsem);
6035 +       percpu_up_read(&file_rwsem);
6036  
6037         locks_dispose_list(&dispose);
6038  }
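
Every fs/locks.c hunk above makes the same substitution, and for the same reason: on RT_FULL ctx->flc_lock becomes a sleeping spinlock, so it cannot be acquired inside the preempt-disabled section that percpu_down_read_preempt_disable() would open. The shared shape after the change is simply (fragment, ctx being the inode's file_lock_context as in the hunks):

	percpu_down_read(&file_rwsem);
	spin_lock(&ctx->flc_lock);
	/* ... walk or modify the flock/lease/posix lists ... */
	spin_unlock(&ctx->flc_lock);
	percpu_up_read(&file_rwsem);
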
6039 diff --git a/fs/namei.c b/fs/namei.c
6040 index d5e5140c1045..150fbdd8e04c 100644
6041 --- a/fs/namei.c
6042 +++ b/fs/namei.c
6043 @@ -1626,7 +1626,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
6044  {
6045         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
6046         struct inode *inode = dir->d_inode;
6047 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6048 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6049  
6050         inode_lock_shared(inode);
6051         /* Don't go there if it's already dead */
6052 @@ -3083,7 +3083,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
6053         struct dentry *dentry;
6054         int error, create_error = 0;
6055         umode_t mode = op->mode;
6056 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6057 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6058  
6059         if (unlikely(IS_DEADDIR(dir_inode)))
6060                 return -ENOENT;
6061 diff --git a/fs/namespace.c b/fs/namespace.c
6062 index 5e35057f07ac..843d274ba167 100644
6063 --- a/fs/namespace.c
6064 +++ b/fs/namespace.c
6065 @@ -14,6 +14,7 @@
6066  #include <linux/mnt_namespace.h>
6067  #include <linux/user_namespace.h>
6068  #include <linux/namei.h>
6069 +#include <linux/delay.h>
6070  #include <linux/security.h>
6071  #include <linux/idr.h>
6072  #include <linux/init.h>                /* init_rootfs */
6073 @@ -356,8 +357,11 @@ int __mnt_want_write(struct vfsmount *m)
6074          * incremented count after it has set MNT_WRITE_HOLD.
6075          */
6076         smp_mb();
6077 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
6078 -               cpu_relax();
6079 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
6080 +               preempt_enable();
6081 +               cpu_chill();
6082 +               preempt_disable();
6083 +       }
6084         /*
6085          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
6086          * be set to match its requirements. So we must not load that until
6087 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
6088 index dff600ae0d74..d726d2e09353 100644
6089 --- a/fs/nfs/delegation.c
6090 +++ b/fs/nfs/delegation.c
6091 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
6092                 sp = state->owner;
6093                 /* Block nfs4_proc_unlck */
6094                 mutex_lock(&sp->so_delegreturn_mutex);
6095 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6096 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
6097                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
6098                 if (!err)
6099                         err = nfs_delegation_claim_locks(ctx, state, stateid);
6100 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6101 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
6102                         err = -EAGAIN;
6103                 mutex_unlock(&sp->so_delegreturn_mutex);
6104                 put_nfs_open_context(ctx);
6105 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
6106 index 53e02b8bd9bd..a66e7d77cfbb 100644
6107 --- a/fs/nfs/dir.c
6108 +++ b/fs/nfs/dir.c
6109 @@ -485,7 +485,7 @@ static
6110  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
6111  {
6112         struct qstr filename = QSTR_INIT(entry->name, entry->len);
6113 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6114 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6115         struct dentry *dentry;
6116         struct dentry *alias;
6117         struct inode *dir = d_inode(parent);
6118 @@ -1487,7 +1487,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
6119                     struct file *file, unsigned open_flags,
6120                     umode_t mode, int *opened)
6121  {
6122 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6123 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6124         struct nfs_open_context *ctx;
6125         struct dentry *res;
6126         struct iattr attr = { .ia_valid = ATTR_OPEN };
6127 @@ -1802,7 +1802,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6128  
6129         trace_nfs_rmdir_enter(dir, dentry);
6130         if (d_really_is_positive(dentry)) {
6131 +#ifdef CONFIG_PREEMPT_RT_BASE
6132 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
6133 +#else
6134                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6135 +#endif
6136                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6137                 /* Ensure the VFS deletes this inode */
6138                 switch (error) {
6139 @@ -1812,7 +1816,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6140                 case -ENOENT:
6141                         nfs_dentry_handle_enoent(dentry);
6142                 }
6143 +#ifdef CONFIG_PREEMPT_RT_BASE
6144 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
6145 +#else
6146                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6147 +#endif
6148         } else
6149                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6150         trace_nfs_rmdir_exit(dir, dentry, error);
6151 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
6152 index bf4ec5ecc97e..36cd5fc9192c 100644
6153 --- a/fs/nfs/inode.c
6154 +++ b/fs/nfs/inode.c
6155 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
6156         nfsi->nrequests = 0;
6157         nfsi->commit_info.ncommit = 0;
6158         atomic_set(&nfsi->commit_info.rpcs_out, 0);
6159 +#ifdef CONFIG_PREEMPT_RT_BASE
6160 +       sema_init(&nfsi->rmdir_sem, 1);
6161 +#else
6162         init_rwsem(&nfsi->rmdir_sem);
6163 +#endif
6164         nfs4_init_once(nfsi);
6165  }
6166  
6167 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
6168 index 1452177c822d..f43b01d54c59 100644
6169 --- a/fs/nfs/nfs4_fs.h
6170 +++ b/fs/nfs/nfs4_fs.h
6171 @@ -111,7 +111,7 @@ struct nfs4_state_owner {
6172         unsigned long        so_flags;
6173         struct list_head     so_states;
6174         struct nfs_seqid_counter so_seqid;
6175 -       seqcount_t           so_reclaim_seqcount;
6176 +       seqlock_t            so_reclaim_seqlock;
6177         struct mutex         so_delegreturn_mutex;
6178  };
6179  
6180 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
6181 index 4e894d301c88..3300a4b5c87c 100644
6182 --- a/fs/nfs/nfs4proc.c
6183 +++ b/fs/nfs/nfs4proc.c
6184 @@ -2695,7 +2695,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6185         unsigned int seq;
6186         int ret;
6187  
6188 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6189 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6190  
6191         ret = _nfs4_proc_open(opendata);
6192         if (ret != 0)
6193 @@ -2733,7 +2733,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6194  
6195         if (d_inode(dentry) == state->inode) {
6196                 nfs_inode_attach_open_context(ctx);
6197 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6198 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
6199                         nfs4_schedule_stateid_recovery(server, state);
6200         }
6201  out:
6202 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
6203 index 0959c9661662..dabd834d7686 100644
6204 --- a/fs/nfs/nfs4state.c
6205 +++ b/fs/nfs/nfs4state.c
6206 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
6207         nfs4_init_seqid_counter(&sp->so_seqid);
6208         atomic_set(&sp->so_count, 1);
6209         INIT_LIST_HEAD(&sp->so_lru);
6210 -       seqcount_init(&sp->so_reclaim_seqcount);
6211 +       seqlock_init(&sp->so_reclaim_seqlock);
6212         mutex_init(&sp->so_delegreturn_mutex);
6213         return sp;
6214  }
6215 @@ -1497,8 +1497,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6216          * recovering after a network partition or a reboot from a
6217          * server that doesn't support a grace period.
6218          */
6219 +#ifdef CONFIG_PREEMPT_RT_FULL
6220 +       write_seqlock(&sp->so_reclaim_seqlock);
6221 +#else
6222 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6223 +#endif
6224         spin_lock(&sp->so_lock);
6225 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
6226  restart:
6227         list_for_each_entry(state, &sp->so_states, open_states) {
6228                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
6229 @@ -1567,14 +1571,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6230                 spin_lock(&sp->so_lock);
6231                 goto restart;
6232         }
6233 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6234         spin_unlock(&sp->so_lock);
6235 +#ifdef CONFIG_PREEMPT_RT_FULL
6236 +       write_sequnlock(&sp->so_reclaim_seqlock);
6237 +#else
6238 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6239 +#endif
6240         return 0;
6241  out_err:
6242         nfs4_put_open_state(state);
6243 -       spin_lock(&sp->so_lock);
6244 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6245 -       spin_unlock(&sp->so_lock);
6246 +#ifdef CONFIG_PREEMPT_RT_FULL
6247 +       write_sequnlock(&sp->so_reclaim_seqlock);
6248 +#else
6249 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6250 +#endif
6251         return status;
6252  }
6253  
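
so_reclaim_seqcount becomes a full seqlock_t so that on RT the write side can use the embedded spinlock: the reclaim loop sleeps, which a raw seqcount write section would forbid. Condensed reader/writer shape from the NFS hunks above (illustrative, error handling elided):

	/* reader: delegation recall / open path */
	unsigned int seq;

	seq = read_seqbegin(&sp->so_reclaim_seqlock);
	/* ... claim locks, attach the open context ... */
	if (read_seqretry(&sp->so_reclaim_seqlock, seq))
		err = -EAGAIN;	/* state recovery ran concurrently */

	/* writer: nfs4_reclaim_open_state() */
#ifdef CONFIG_PREEMPT_RT_FULL
	write_seqlock(&sp->so_reclaim_seqlock);
#else
	write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
#endif
	/* ... reopen every state owned by sp ... */
#ifdef CONFIG_PREEMPT_RT_FULL
	write_sequnlock(&sp->so_reclaim_seqlock);
#else
	write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
#endif
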
6254 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
6255 index 191aa577dd1f..58990c8f52e0 100644
6256 --- a/fs/nfs/unlink.c
6257 +++ b/fs/nfs/unlink.c
6258 @@ -12,7 +12,7 @@
6259  #include <linux/sunrpc/clnt.h>
6260  #include <linux/nfs_fs.h>
6261  #include <linux/sched.h>
6262 -#include <linux/wait.h>
6263 +#include <linux/swait.h>
6264  #include <linux/namei.h>
6265  #include <linux/fsnotify.h>
6266  
6267 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
6268                 rpc_restart_call_prepare(task);
6269  }
6270  
6271 +#ifdef CONFIG_PREEMPT_RT_BASE
6272 +static void nfs_down_anon(struct semaphore *sema)
6273 +{
6274 +       down(sema);
6275 +}
6276 +
6277 +static void nfs_up_anon(struct semaphore *sema)
6278 +{
6279 +       up(sema);
6280 +}
6281 +
6282 +#else
6283 +static void nfs_down_anon(struct rw_semaphore *rwsem)
6284 +{
6285 +       down_read_non_owner(rwsem);
6286 +}
6287 +
6288 +static void nfs_up_anon(struct rw_semaphore *rwsem)
6289 +{
6290 +       up_read_non_owner(rwsem);
6291 +}
6292 +#endif
6293 +
6294  /**
6295   * nfs_async_unlink_release - Release the sillydelete data.
6296   * @task: rpc_task of the sillydelete
6297 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
6298         struct dentry *dentry = data->dentry;
6299         struct super_block *sb = dentry->d_sb;
6300  
6301 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6302 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6303         d_lookup_done(dentry);
6304         nfs_free_unlinkdata(data);
6305         dput(dentry);
6306 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6307         struct inode *dir = d_inode(dentry->d_parent);
6308         struct dentry *alias;
6309  
6310 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6311 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6312         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6313         if (IS_ERR(alias)) {
6314 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6315 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6316                 return 0;
6317         }
6318         if (!d_in_lookup(alias)) {
6319 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6320                         ret = 0;
6321                 spin_unlock(&alias->d_lock);
6322                 dput(alias);
6323 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6324 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6325                 /*
6326                  * If we'd displaced old cached devname, free it.  At that
6327                  * point dentry is definitely not a root, so we won't need
6328 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
6329                 goto out_free_name;
6330         }
6331         data->res.dir_attr = &data->dir_attr;
6332 -       init_waitqueue_head(&data->wq);
6333 +       init_swait_queue_head(&data->wq);
6334  
6335         status = -EBUSY;
6336         spin_lock(&dentry->d_lock);
6337 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
6338 index fe251f187ff8..e89da4fb14c2 100644
6339 --- a/fs/ntfs/aops.c
6340 +++ b/fs/ntfs/aops.c
6341 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6342                         ofs = 0;
6343                         if (file_ofs < init_size)
6344                                 ofs = init_size - file_ofs;
6345 -                       local_irq_save(flags);
6346 +                       local_irq_save_nort(flags);
6347                         kaddr = kmap_atomic(page);
6348                         memset(kaddr + bh_offset(bh) + ofs, 0,
6349                                         bh->b_size - ofs);
6350                         flush_dcache_page(page);
6351                         kunmap_atomic(kaddr);
6352 -                       local_irq_restore(flags);
6353 +                       local_irq_restore_nort(flags);
6354                 }
6355         } else {
6356                 clear_buffer_uptodate(bh);
6357 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6358                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6359         }
6360         first = page_buffers(page);
6361 -       local_irq_save(flags);
6362 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6363 +       flags = bh_uptodate_lock_irqsave(first);
6364         clear_buffer_async_read(bh);
6365         unlock_buffer(bh);
6366         tmp = bh;
6367 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6368                 }
6369                 tmp = tmp->b_this_page;
6370         } while (tmp != bh);
6371 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6372 -       local_irq_restore(flags);
6373 +       bh_uptodate_unlock_irqrestore(first, flags);
6374         /*
6375          * If none of the buffers had errors then we can set the page uptodate,
6376          * but we first have to perform the post read mst fixups, if the
6377 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6378                 recs = PAGE_SIZE / rec_size;
6379                 /* Should have been verified before we got here... */
6380                 BUG_ON(!recs);
6381 -               local_irq_save(flags);
6382 +               local_irq_save_nort(flags);
6383                 kaddr = kmap_atomic(page);
6384                 for (i = 0; i < recs; i++)
6385                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6386                                         i * rec_size), rec_size);
6387                 kunmap_atomic(kaddr);
6388 -               local_irq_restore(flags);
6389 +               local_irq_restore_nort(flags);
6390                 flush_dcache_page(page);
6391                 if (likely(page_uptodate && !PageError(page)))
6392                         SetPageUptodate(page);
6393 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6394         unlock_page(page);
6395         return;
6396  still_busy:
6397 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6398 -       local_irq_restore(flags);
6399 -       return;
6400 +       bh_uptodate_unlock_irqrestore(first, flags);
6401  }
6402  
6403  /**
6404 diff --git a/fs/proc/base.c b/fs/proc/base.c
6405 index ca651ac00660..41d9dc789285 100644
6406 --- a/fs/proc/base.c
6407 +++ b/fs/proc/base.c
6408 @@ -1834,7 +1834,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
6409  
6410         child = d_hash_and_lookup(dir, &qname);
6411         if (!child) {
6412 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6413 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6414                 child = d_alloc_parallel(dir, &qname, &wq);
6415                 if (IS_ERR(child))
6416                         goto end_instantiate;
6417 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
6418 index d4e37acd4821..000cea46434a 100644
6419 --- a/fs/proc/proc_sysctl.c
6420 +++ b/fs/proc/proc_sysctl.c
6421 @@ -632,7 +632,7 @@ static bool proc_sys_fill_cache(struct file *file,
6422  
6423         child = d_lookup(dir, &qname);
6424         if (!child) {
6425 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6426 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6427                 child = d_alloc_parallel(dir, &qname, &wq);
6428                 if (IS_ERR(child))
6429                         return false;
6430 diff --git a/fs/timerfd.c b/fs/timerfd.c
6431 index ab8dd1538381..5580853f57dd 100644
6432 --- a/fs/timerfd.c
6433 +++ b/fs/timerfd.c
6434 @@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags,
6435                                 break;
6436                 }
6437                 spin_unlock_irq(&ctx->wqh.lock);
6438 -               cpu_relax();
6439 +               if (isalarm(ctx))
6440 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6441 +               else
6442 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6443         }
6444  
6445         /*
6446 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
6447 index e861a24f06f2..b5c97d3059c7 100644
6448 --- a/include/acpi/platform/aclinux.h
6449 +++ b/include/acpi/platform/aclinux.h
6450 @@ -133,6 +133,7 @@
6451  
6452  #define acpi_cache_t                        struct kmem_cache
6453  #define acpi_spinlock                       spinlock_t *
6454 +#define acpi_raw_spinlock              raw_spinlock_t *
6455  #define acpi_cpu_flags                      unsigned long
6456  
6457  /* Use native linux version of acpi_os_allocate_zeroed */
6458 @@ -151,6 +152,20 @@
6459  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6460  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6461  
6462 +#define acpi_os_create_raw_lock(__handle)                      \
6463 +({                                                             \
6464 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6465 +                                                               \
6466 +        if (lock) {                                            \
6467 +               *(__handle) = lock;                             \
6468 +               raw_spin_lock_init(*(__handle));                \
6469 +        }                                                      \
6470 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6471 + })
6472 +
6473 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6474 +
6475 +
6476  /*
6477   * OSL interfaces used by debugger/disassembler
6478   */
6479 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
6480 index 6f96247226a4..fa53a21263c2 100644
6481 --- a/include/asm-generic/bug.h
6482 +++ b/include/asm-generic/bug.h
6483 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
6484  # define WARN_ON_SMP(x)                        ({0;})
6485  #endif
6486  
6487 +#ifdef CONFIG_PREEMPT_RT_BASE
6488 +# define BUG_ON_RT(c)                  BUG_ON(c)
6489 +# define BUG_ON_NONRT(c)               do { } while (0)
6490 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6491 +# define WARN_ON_NONRT(condition)      do { } while (0)
6492 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6493 +#else
6494 +# define BUG_ON_RT(c)                  do { } while (0)
6495 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6496 +# define WARN_ON_RT(condition)         do { } while (0)
6497 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6498 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6499 +#endif
6500 +
6501  #endif /* __ASSEMBLY__ */
6502  
6503  #endif
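
The *_RT/*_NONRT variants let shared code keep assertions that only hold under one preemption model. A hypothetical (not from this patch) use: a path that runs with interrupts disabled on mainline but in a preemptible, interrupts-on section on RT_FULL keeps its sanity check one-sided.

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, nr_updates);	/* hypothetical counter */

static void update_percpu_stats(void)
{
	WARN_ON_NONRT(!irqs_disabled());	/* only meaningful on !RT */
	__this_cpu_inc(nr_updates);
}
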
6504 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
6505 index 535ab2e13d2e..cfc246899473 100644
6506 --- a/include/linux/blk-mq.h
6507 +++ b/include/linux/blk-mq.h
6508 @@ -209,7 +209,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
6509         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6510  }
6511  
6512 -
6513 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6514  int blk_mq_request_started(struct request *rq);
6515  void blk_mq_start_request(struct request *rq);
6516  void blk_mq_end_request(struct request *rq, int error);
6517 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
6518 index f6a816129856..ec7a4676f8a8 100644
6519 --- a/include/linux/blkdev.h
6520 +++ b/include/linux/blkdev.h
6521 @@ -89,6 +89,7 @@ struct request {
6522         struct list_head queuelist;
6523         union {
6524                 struct call_single_data csd;
6525 +               struct work_struct work;
6526                 u64 fifo_time;
6527         };
6528  
6529 @@ -467,7 +468,7 @@ struct request_queue {
6530         struct throtl_data *td;
6531  #endif
6532         struct rcu_head         rcu_head;
6533 -       wait_queue_head_t       mq_freeze_wq;
6534 +       struct swait_queue_head mq_freeze_wq;
6535         struct percpu_ref       q_usage_counter;
6536         struct list_head        all_q_node;
6537  
6538 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
6539 index 8fdcb783197d..d07dbeec7bc1 100644
6540 --- a/include/linux/bottom_half.h
6541 +++ b/include/linux/bottom_half.h
6542 @@ -3,6 +3,39 @@
6543  
6544  #include <linux/preempt.h>
6545  
6546 +#ifdef CONFIG_PREEMPT_RT_FULL
6547 +
6548 +extern void __local_bh_disable(void);
6549 +extern void _local_bh_enable(void);
6550 +extern void __local_bh_enable(void);
6551 +
6552 +static inline void local_bh_disable(void)
6553 +{
6554 +       __local_bh_disable();
6555 +}
6556 +
6557 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6558 +{
6559 +       __local_bh_disable();
6560 +}
6561 +
6562 +static inline void local_bh_enable(void)
6563 +{
6564 +       __local_bh_enable();
6565 +}
6566 +
6567 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6568 +{
6569 +       __local_bh_enable();
6570 +}
6571 +
6572 +static inline void local_bh_enable_ip(unsigned long ip)
6573 +{
6574 +       __local_bh_enable();
6575 +}
6576 +
6577 +#else
6578 +
6579  #ifdef CONFIG_TRACE_IRQFLAGS
6580  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6581  #else
6582 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
6583  {
6584         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6585  }
6586 +#endif
6587  
6588  #endif /* _LINUX_BH_H */
6589 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
6590 index ebbacd14d450..be5e87f6360a 100644
6591 --- a/include/linux/buffer_head.h
6592 +++ b/include/linux/buffer_head.h
6593 @@ -75,8 +75,50 @@ struct buffer_head {
6594         struct address_space *b_assoc_map;      /* mapping this buffer is
6595                                                    associated with */
6596         atomic_t b_count;               /* users using this buffer_head */
6597 +#ifdef CONFIG_PREEMPT_RT_BASE
6598 +       spinlock_t b_uptodate_lock;
6599 +#if IS_ENABLED(CONFIG_JBD2)
6600 +       spinlock_t b_state_lock;
6601 +       spinlock_t b_journal_head_lock;
6602 +#endif
6603 +#endif
6604  };
6605  
6606 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6607 +{
6608 +       unsigned long flags;
6609 +
6610 +#ifndef CONFIG_PREEMPT_RT_BASE
6611 +       local_irq_save(flags);
6612 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6613 +#else
6614 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6615 +#endif
6616 +       return flags;
6617 +}
6618 +
6619 +static inline void
6620 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6621 +{
6622 +#ifndef CONFIG_PREEMPT_RT_BASE
6623 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6624 +       local_irq_restore(flags);
6625 +#else
6626 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6627 +#endif
6628 +}
6629 +
6630 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6631 +{
6632 +#ifdef CONFIG_PREEMPT_RT_BASE
6633 +       spin_lock_init(&bh->b_uptodate_lock);
6634 +#if IS_ENABLED(CONFIG_JBD2)
6635 +       spin_lock_init(&bh->b_state_lock);
6636 +       spin_lock_init(&bh->b_journal_head_lock);
6637 +#endif
6638 +#endif
6639 +}
6640 +
6641  /*
6642   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6643   * and buffer_foo() functions.
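
These helpers replace the open-coded local_irq_save() + bit_spin_lock(BH_Uptodate_Lock) pairs seen in the fs/buffer.c and fs/ntfs/aops.c hunks earlier: on RT the bit spinlock, which would require a non-preemptible irqs-off section, is swapped for a real per-buffer_head spinlock. Typical caller shape, condensed from those end_io hunks:

static void end_io_sketch(struct buffer_head *bh)
{
	struct buffer_head *first = page_buffers(bh->b_page);
	unsigned long flags;

	flags = bh_uptodate_lock_irqsave(first);
	/* ... clear async bits, check whether the whole page is done ... */
	bh_uptodate_unlock_irqrestore(first, flags);
}
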
6644 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
6645 index 5b17de62c962..56027cc01a56 100644
6646 --- a/include/linux/cgroup-defs.h
6647 +++ b/include/linux/cgroup-defs.h
6648 @@ -16,6 +16,7 @@
6649  #include <linux/percpu-refcount.h>
6650  #include <linux/percpu-rwsem.h>
6651  #include <linux/workqueue.h>
6652 +#include <linux/swork.h>
6653  
6654  #ifdef CONFIG_CGROUPS
6655  
6656 @@ -137,6 +138,7 @@ struct cgroup_subsys_state {
6657         /* percpu_ref killing and RCU release */
6658         struct rcu_head rcu_head;
6659         struct work_struct destroy_work;
6660 +       struct swork_event destroy_swork;
6661  };
6662  
6663  /*
6664 diff --git a/include/linux/completion.h b/include/linux/completion.h
6665 index 5d5aaae3af43..3bca1590e29f 100644
6666 --- a/include/linux/completion.h
6667 +++ b/include/linux/completion.h
6668 @@ -7,8 +7,7 @@
6669   * Atomic wait-for-completion handler data structures.
6670   * See kernel/sched/completion.c for details.
6671   */
6672 -
6673 -#include <linux/wait.h>
6674 +#include <linux/swait.h>
6675  
6676  /*
6677   * struct completion - structure used to maintain state for a "completion"
6678 @@ -24,11 +23,11 @@
6679   */
6680  struct completion {
6681         unsigned int done;
6682 -       wait_queue_head_t wait;
6683 +       struct swait_queue_head wait;
6684  };
6685  
6686  #define COMPLETION_INITIALIZER(work) \
6687 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6688 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6689  
6690  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6691         ({ init_completion(&work); work; })
6692 @@ -73,7 +72,7 @@ struct completion {
6693  static inline void init_completion(struct completion *x)
6694  {
6695         x->done = 0;
6696 -       init_waitqueue_head(&x->wait);
6697 +       init_swait_queue_head(&x->wait);
6698  }
6699  
6700  /**
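
The completion API itself does not change; only the internal wait queue becomes a simple (swait) queue, which is safe to wake from the raw-lock and hard-IRQ contexts that RT leaves truly atomic. Callers therefore still look like the sketch below (kick_off_async_op() is a hypothetical helper that eventually calls complete(&done)):

static int do_sync_op(void)
{
	DECLARE_COMPLETION_ONSTACK(done);

	if (kick_off_async_op(&done))
		return -EIO;

	wait_for_completion(&done);	/* sleeps on the swait queue */
	return 0;
}
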
6701 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
6702 index e571128ad99a..5e52d28c20c1 100644
6703 --- a/include/linux/cpu.h
6704 +++ b/include/linux/cpu.h
6705 @@ -182,6 +182,8 @@ extern void get_online_cpus(void);
6706  extern void put_online_cpus(void);
6707  extern void cpu_hotplug_disable(void);
6708  extern void cpu_hotplug_enable(void);
6709 +extern void pin_current_cpu(void);
6710 +extern void unpin_current_cpu(void);
6711  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6712  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6713  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6714 @@ -199,6 +201,8 @@ static inline void cpu_hotplug_done(void) {}
6715  #define put_online_cpus()      do { } while (0)
6716  #define cpu_hotplug_disable()  do { } while (0)
6717  #define cpu_hotplug_enable()   do { } while (0)
6718 +static inline void pin_current_cpu(void) { }
6719 +static inline void unpin_current_cpu(void) { }
6720  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6721  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6722  /* These aren't inline functions due to a GCC bug. */
6723 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
6724 index 5beed7b30561..61cab7ef458e 100644
6725 --- a/include/linux/dcache.h
6726 +++ b/include/linux/dcache.h
6727 @@ -11,6 +11,7 @@
6728  #include <linux/rcupdate.h>
6729  #include <linux/lockref.h>
6730  #include <linux/stringhash.h>
6731 +#include <linux/wait.h>
6732  
6733  struct path;
6734  struct vfsmount;
6735 @@ -100,7 +101,7 @@ struct dentry {
6736  
6737         union {
6738                 struct list_head d_lru;         /* LRU list */
6739 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6740 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6741         };
6742         struct list_head d_child;       /* child of parent list */
6743         struct list_head d_subdirs;     /* our children */
6744 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
6745  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6746  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6747  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6748 -                                       wait_queue_head_t *);
6749 +                                       struct swait_queue_head *);
6750  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6751  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6752  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6753 diff --git a/include/linux/delay.h b/include/linux/delay.h
6754 index a6ecb34cf547..37caab306336 100644
6755 --- a/include/linux/delay.h
6756 +++ b/include/linux/delay.h
6757 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
6758         msleep(seconds * 1000);
6759  }
6760  
6761 +#ifdef CONFIG_PREEMPT_RT_FULL
6762 +extern void cpu_chill(void);
6763 +#else
6764 +# define cpu_chill()   cpu_relax()
6765 +#endif
6766 +
6767  #endif /* defined(_LINUX_DELAY_H) */
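
cpu_chill() is the RT replacement for cpu_relax() in trylock-retry loops (see the autofs4, dcache and namespace hunks earlier): on RT it sleeps briefly instead of spinning, so a lock holder that has been preempted can actually run and release the lock. The canonical loop shape those hunks converge on, with a hypothetical struct foo carrying the two locks:

static void lock_both(struct foo *a, struct foo *b)
{
again:
	spin_lock(&a->lock);
	if (!spin_trylock(&b->lock)) {
		spin_unlock(&a->lock);
		cpu_chill();		/* back off instead of busy-waiting */
		goto again;
	}
	/* ... both locks held, do the work ... */
	spin_unlock(&b->lock);
	spin_unlock(&a->lock);
}
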
6768 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
6769 index bb3f3297062a..a117a33ef72c 100644
6770 --- a/include/linux/highmem.h
6771 +++ b/include/linux/highmem.h
6772 @@ -7,6 +7,7 @@
6773  #include <linux/mm.h>
6774  #include <linux/uaccess.h>
6775  #include <linux/hardirq.h>
6776 +#include <linux/sched.h>
6777  
6778  #include <asm/cacheflush.h>
6779  
6780 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
6781  
6782  static inline void *kmap_atomic(struct page *page)
6783  {
6784 -       preempt_disable();
6785 +       preempt_disable_nort();
6786         pagefault_disable();
6787         return page_address(page);
6788  }
6789 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
6790  static inline void __kunmap_atomic(void *addr)
6791  {
6792         pagefault_enable();
6793 -       preempt_enable();
6794 +       preempt_enable_nort();
6795  }
6796  
6797  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
6798 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
6799  
6800  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6801  
6802 +#ifndef CONFIG_PREEMPT_RT_FULL
6803  DECLARE_PER_CPU(int, __kmap_atomic_idx);
6804 +#endif
6805  
6806  static inline int kmap_atomic_idx_push(void)
6807  {
6808 +#ifndef CONFIG_PREEMPT_RT_FULL
6809         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6810  
6811 -#ifdef CONFIG_DEBUG_HIGHMEM
6812 +# ifdef CONFIG_DEBUG_HIGHMEM
6813         WARN_ON_ONCE(in_irq() && !irqs_disabled());
6814         BUG_ON(idx >= KM_TYPE_NR);
6815 -#endif
6816 +# endif
6817         return idx;
6818 +#else
6819 +       current->kmap_idx++;
6820 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
6821 +       return current->kmap_idx - 1;
6822 +#endif
6823  }
6824  
6825  static inline int kmap_atomic_idx(void)
6826  {
6827 +#ifndef CONFIG_PREEMPT_RT_FULL
6828         return __this_cpu_read(__kmap_atomic_idx) - 1;
6829 +#else
6830 +       return current->kmap_idx - 1;
6831 +#endif
6832  }
6833  
6834  static inline void kmap_atomic_idx_pop(void)
6835  {
6836 -#ifdef CONFIG_DEBUG_HIGHMEM
6837 +#ifndef CONFIG_PREEMPT_RT_FULL
6838 +# ifdef CONFIG_DEBUG_HIGHMEM
6839         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6840  
6841         BUG_ON(idx < 0);
6842 -#else
6843 +# else
6844         __this_cpu_dec(__kmap_atomic_idx);
6845 +# endif
6846 +#else
6847 +       current->kmap_idx--;
6848 +# ifdef CONFIG_DEBUG_HIGHMEM
6849 +       BUG_ON(current->kmap_idx < 0);
6850 +# endif
6851  #endif
6852  }
6853  
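
The highmem changes above keep the kmap_atomic() calling convention intact; only the slot bookkeeping moves from the per-CPU __kmap_atomic_idx counter to current->kmap_idx, so a task holding an atomic kmap can be preempted on RT. A minimal usage sketch for reference; copy_to_page() is a hypothetical helper:

#include <linux/highmem.h>
#include <linux/string.h>

/*
 * Copy a buffer into a possibly-highmem page.  The kmap_atomic() /
 * kunmap_atomic() pair nests through kmap_atomic_idx_push()/_pop();
 * with the change above the nesting depth lives in the task on RT.
 */
static void copy_to_page(struct page *page, const void *buf, size_t len)
{
	void *addr = kmap_atomic(page);

	memcpy(addr, buf, len);
	kunmap_atomic(addr);
}
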
6854 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
6855 index 5e00f80b1535..a34e10b55cde 100644
6856 --- a/include/linux/hrtimer.h
6857 +++ b/include/linux/hrtimer.h
6858 @@ -87,6 +87,9 @@ enum hrtimer_restart {
6859   * @function:  timer expiry callback function
6860   * @base:      pointer to the timer base (per cpu and per clock)
6861   * @state:     state information (See bit values above)
6862 + * @cb_entry:  list entry to defer timers from hardirq context
6863 + * @irqsafe:   timer can run in hardirq context
6864 + * @praecox:   timer expiry time if expired at the time of programming
6865   * @is_rel:    Set if the timer was armed relative
6866   * @start_pid:  timer statistics field to store the pid of the task which
6867   *             started the timer
6868 @@ -103,6 +106,11 @@ struct hrtimer {
6869         enum hrtimer_restart            (*function)(struct hrtimer *);
6870         struct hrtimer_clock_base       *base;
6871         u8                              state;
6872 +       struct list_head                cb_entry;
6873 +       int                             irqsafe;
6874 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6875 +       ktime_t                         praecox;
6876 +#endif
6877         u8                              is_rel;
6878  #ifdef CONFIG_TIMER_STATS
6879         int                             start_pid;
6880 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
6881         struct task_struct *task;
6882  };
6883  
6884 -#ifdef CONFIG_64BIT
6885  # define HRTIMER_CLOCK_BASE_ALIGN      64
6886 -#else
6887 -# define HRTIMER_CLOCK_BASE_ALIGN      32
6888 -#endif
6889  
6890  /**
6891   * struct hrtimer_clock_base - the timer base for a specific clock
6892 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
6893   *                     timer to a base on another cpu.
6894   * @clockid:           clock id for per_cpu support
6895   * @active:            red black tree root node for the active timers
6896 + * @expired:           list head for deferred timers.
6897   * @get_time:          function to retrieve the current time of the clock
6898   * @offset:            offset of this clock to the monotonic base
6899   */
6900 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
6901         int                     index;
6902         clockid_t               clockid;
6903         struct timerqueue_head  active;
6904 +       struct list_head        expired;
6905         ktime_t                 (*get_time)(void);
6906         ktime_t                 offset;
6907  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6908 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
6909         raw_spinlock_t                  lock;
6910         seqcount_t                      seq;
6911         struct hrtimer                  *running;
6912 +       struct hrtimer                  *running_soft;
6913         unsigned int                    cpu;
6914         unsigned int                    active_bases;
6915         unsigned int                    clock_was_set_seq;
6916 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
6917         unsigned int                    nr_hangs;
6918         unsigned int                    max_hang_time;
6919  #endif
6920 +#ifdef CONFIG_PREEMPT_RT_BASE
6921 +       wait_queue_head_t               wait;
6922 +#endif
6923         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
6924  } ____cacheline_aligned;
6925  
6926 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
6927         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6928  }
6929  
6930 +/* Softirq preemption could deadlock timer removal */
6931 +#ifdef CONFIG_PREEMPT_RT_BASE
6932 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6933 +#else
6934 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6935 +#endif
6936 +
6937  /* Query timers: */
6938  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6939  
6940 @@ -436,9 +453,15 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
6941   * Helper function to check, whether the timer is running the callback
6942   * function
6943   */
6944 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6945 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6946  {
6947 -       return timer->base->cpu_base->running == timer;
6948 +       if (timer->base->cpu_base->running == timer)
6949 +               return 1;
6950 +#ifdef CONFIG_PREEMPT_RT_BASE
6951 +       if (timer->base->cpu_base->running_soft == timer)
6952 +               return 1;
6953 +#endif
6954 +       return 0;
6955  }
6956  
6957  /* Forward a hrtimer so it expires after now: */
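
hrtimer_wait_for_timer() exists because on RT a canceling task may outprioritize the softirq thread running the callback, so spinning on hrtimer_callback_running() could deadlock; the waiter sleeps on cpu_base->wait instead. A hedged sketch of the cancel-and-wait loop this API serves, in the spirit of hrtimer_cancel(); cancel_timer_sync() is a hypothetical name:

#include <linux/hrtimer.h>

static int cancel_timer_sync(struct hrtimer *timer)
{
	for (;;) {
		/* Returns -1 while the callback is still running. */
		int ret = hrtimer_try_to_cancel(timer);

		if (ret >= 0)
			return ret;
		/* Sleep (RT) or cpu_relax() (!RT) until the callback is done. */
		hrtimer_wait_for_timer(timer);
	}
}
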
6958 diff --git a/include/linux/idr.h b/include/linux/idr.h
6959 index 083d61e92706..5899796f50cb 100644
6960 --- a/include/linux/idr.h
6961 +++ b/include/linux/idr.h
6962 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
6963   * Each idr_preload() should be matched with an invocation of this
6964   * function.  See idr_preload() for details.
6965   */
6966 +#ifdef CONFIG_PREEMPT_RT_FULL
6967 +void idr_preload_end(void);
6968 +#else
6969  static inline void idr_preload_end(void)
6970  {
6971         preempt_enable();
6972  }
6973 +#endif
6974  
6975  /**
6976   * idr_find - return pointer for given id
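
On RT, idr_preload_end() becomes a real function because the preload section is no longer covered by a bare preempt_disable(); the caller-visible pattern is unchanged. A minimal sketch of that pattern; my_idr, my_lock and store_object() are hypothetical names:

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static DEFINE_IDR(my_idr);		/* hypothetical */
static DEFINE_SPINLOCK(my_lock);	/* hypothetical */

static int store_object(void *obj)
{
	int id;

	idr_preload(GFP_KERNEL);	/* preallocate outside the lock */
	spin_lock(&my_lock);
	id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
	spin_unlock(&my_lock);
	idr_preload_end();		/* must pair with idr_preload() */

	return id;			/* new id, or a negative errno */
}
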
6977 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
6978 index 325f649d77ff..a56e263f5005 100644
6979 --- a/include/linux/init_task.h
6980 +++ b/include/linux/init_task.h
6981 @@ -150,6 +150,12 @@ extern struct task_group root_task_group;
6982  # define INIT_PERF_EVENTS(tsk)
6983  #endif
6984  
6985 +#ifdef CONFIG_PREEMPT_RT_BASE
6986 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
6987 +#else
6988 +# define INIT_TIMER_LIST
6989 +#endif
6990 +
6991  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6992  # define INIT_VTIME(tsk)                                               \
6993         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
6994 @@ -164,6 +170,7 @@ extern struct task_group root_task_group;
6995  #ifdef CONFIG_RT_MUTEXES
6996  # define INIT_RT_MUTEXES(tsk)                                          \
6997         .pi_waiters = RB_ROOT,                                          \
6998 +       .pi_top_task = NULL,                                            \
6999         .pi_waiters_leftmost = NULL,
7000  #else
7001  # define INIT_RT_MUTEXES(tsk)
7002 @@ -250,6 +257,7 @@ extern struct task_group root_task_group;
7003         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
7004         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
7005         .timer_slack_ns = 50000, /* 50 usec default slack */            \
7006 +       INIT_TIMER_LIST                                                 \
7007         .pids = {                                                       \
7008                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
7009                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
7010 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
7011 index 72f0721f75e7..480972ae47d3 100644
7012 --- a/include/linux/interrupt.h
7013 +++ b/include/linux/interrupt.h
7014 @@ -14,6 +14,7 @@
7015  #include <linux/hrtimer.h>
7016  #include <linux/kref.h>
7017  #include <linux/workqueue.h>
7018 +#include <linux/swork.h>
7019  
7020  #include <linux/atomic.h>
7021  #include <asm/ptrace.h>
7022 @@ -61,6 +62,7 @@
7023   *                interrupt handler after suspending interrupts. For system
7024   *                wakeup devices users need to implement wakeup detection in
7025   *                their interrupt handlers.
7026 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
7027   */
7028  #define IRQF_SHARED            0x00000080
7029  #define IRQF_PROBE_SHARED      0x00000100
7030 @@ -74,6 +76,7 @@
7031  #define IRQF_NO_THREAD         0x00010000
7032  #define IRQF_EARLY_RESUME      0x00020000
7033  #define IRQF_COND_SUSPEND      0x00040000
7034 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
7035  
7036  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
7037  
7038 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
7039  #ifdef CONFIG_LOCKDEP
7040  # define local_irq_enable_in_hardirq() do { } while (0)
7041  #else
7042 -# define local_irq_enable_in_hardirq() local_irq_enable()
7043 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
7044  #endif
7045  
7046  extern void disable_irq_nosync(unsigned int irq);
7047 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
7048   * struct irq_affinity_notify - context for notification of IRQ affinity changes
7049   * @irq:               Interrupt to which notification applies
7050   * @kref:              Reference count, for internal use
7051 + * @swork:             Swork item, for internal use
7052   * @work:              Work item, for internal use
7053   * @notify:            Function to be called on change.  This will be
7054   *                     called in process context.
7055 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
7056  struct irq_affinity_notify {
7057         unsigned int irq;
7058         struct kref kref;
7059 +#ifdef CONFIG_PREEMPT_RT_BASE
7060 +       struct swork_event swork;
7061 +#else
7062         struct work_struct work;
7063 +#endif
7064         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
7065         void (*release)(struct kref *ref);
7066  };
7067 @@ -406,9 +414,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
7068                                  bool state);
7069  
7070  #ifdef CONFIG_IRQ_FORCED_THREADING
7071 +# ifndef CONFIG_PREEMPT_RT_BASE
7072  extern bool force_irqthreads;
7073 +# else
7074 +#  define force_irqthreads     (true)
7075 +# endif
7076  #else
7077 -#define force_irqthreads       (0)
7078 +#define force_irqthreads       (false)
7079  #endif
7080  
7081  #ifndef __ARCH_SET_SOFTIRQ_PENDING
7082 @@ -465,9 +477,10 @@ struct softirq_action
7083         void    (*action)(struct softirq_action *);
7084  };
7085  
7086 +#ifndef CONFIG_PREEMPT_RT_FULL
7087  asmlinkage void do_softirq(void);
7088  asmlinkage void __do_softirq(void);
7089 -
7090 +static inline void thread_do_softirq(void) { do_softirq(); }
7091  #ifdef __ARCH_HAS_DO_SOFTIRQ
7092  void do_softirq_own_stack(void);
7093  #else
7094 @@ -476,13 +489,25 @@ static inline void do_softirq_own_stack(void)
7095         __do_softirq();
7096  }
7097  #endif
7098 +#else
7099 +extern void thread_do_softirq(void);
7100 +#endif
7101  
7102  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
7103  extern void softirq_init(void);
7104  extern void __raise_softirq_irqoff(unsigned int nr);
7105 +#ifdef CONFIG_PREEMPT_RT_FULL
7106 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
7107 +#else
7108 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
7109 +{
7110 +       __raise_softirq_irqoff(nr);
7111 +}
7112 +#endif
7113  
7114  extern void raise_softirq_irqoff(unsigned int nr);
7115  extern void raise_softirq(unsigned int nr);
7116 +extern void softirq_check_pending_idle(void);
7117  
7118  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
7119  
7120 @@ -504,8 +529,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
7121       to be executed on some cpu at least once after this.
7122     * If the tasklet is already scheduled, but its execution is still not
7123       started, it will be executed only once.
7124 -   * If this tasklet is already running on another CPU (or schedule is called
7125 -     from tasklet itself), it is rescheduled for later.
7126 +   * If this tasklet is already running on another CPU, it is rescheduled
7127 +     for later.
7128 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
7129     * Tasklet is strictly serialized wrt itself, but not
7130       wrt another tasklets. If client needs some intertask synchronization,
7131       he makes it with spinlocks.
7132 @@ -530,27 +556,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
7133  enum
7134  {
7135         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
7136 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
7137 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
7138 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
7139  };
7140  
7141 -#ifdef CONFIG_SMP
7142 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
7143 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
7144 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
7145 +
7146 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
7147  static inline int tasklet_trylock(struct tasklet_struct *t)
7148  {
7149         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
7150  }
7151  
7152 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
7153 +{
7154 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
7155 +}
7156 +
7157  static inline void tasklet_unlock(struct tasklet_struct *t)
7158  {
7159         smp_mb__before_atomic();
7160         clear_bit(TASKLET_STATE_RUN, &(t)->state);
7161  }
7162  
7163 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
7164 -{
7165 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
7166 -}
7167 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
7168 +
7169  #else
7170  #define tasklet_trylock(t) 1
7171 +#define tasklet_tryunlock(t)   1
7172  #define tasklet_unlock_wait(t) do { } while (0)
7173  #define tasklet_unlock(t) do { } while (0)
7174  #endif
7175 @@ -599,12 +634,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
7176         smp_mb();
7177  }
7178  
7179 -static inline void tasklet_enable(struct tasklet_struct *t)
7180 -{
7181 -       smp_mb__before_atomic();
7182 -       atomic_dec(&t->count);
7183 -}
7184 -
7185 +extern void tasklet_enable(struct tasklet_struct *t);
7186  extern void tasklet_kill(struct tasklet_struct *t);
7187  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
7188  extern void tasklet_init(struct tasklet_struct *t,
7189 @@ -635,6 +665,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
7190         tasklet_kill(&ttimer->tasklet);
7191  }
7192  
7193 +#ifdef CONFIG_PREEMPT_RT_FULL
7194 +extern void softirq_early_init(void);
7195 +#else
7196 +static inline void softirq_early_init(void) { }
7197 +#endif
7198 +
7199  /*
7200   * Autoprobing for irqs:
7201   *
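
The new IRQF_NO_SOFTIRQ_CALL flag is passed to request_irq()/request_threaded_irq() like any other IRQF_* flag; on RT, where handlers are force-threaded, it keeps the irq thread from processing pending softirqs after the handler returns. A hedged sketch; the handler, device name and helper below are hypothetical:

#include <linux/interrupt.h>

static irqreturn_t my_handler(int irq, void *dev_id)	/* hypothetical */
{
	return IRQ_HANDLED;
}

static int my_request_irq(unsigned int irq, void *dev)	/* hypothetical */
{
	/*
	 * On RT the handler runs in a thread; the extra flag keeps that
	 * thread from running softirq work on top of the device work.
	 */
	return request_irq(irq, my_handler, IRQF_NO_SOFTIRQ_CALL,
			   "my-device", dev);
}
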
7202 diff --git a/include/linux/irq.h b/include/linux/irq.h
7203 index 39e3254e5769..8ebac94fbb9f 100644
7204 --- a/include/linux/irq.h
7205 +++ b/include/linux/irq.h
7206 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
7207   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
7208   *                               it from the spurious interrupt detection
7209   *                               mechanism and from core side polling.
7210 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
7211   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
7212   */
7213  enum {
7214 @@ -99,13 +100,14 @@ enum {
7215         IRQ_PER_CPU_DEVID       = (1 << 17),
7216         IRQ_IS_POLLED           = (1 << 18),
7217         IRQ_DISABLE_UNLAZY      = (1 << 19),
7218 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
7219  };
7220  
7221  #define IRQF_MODIFY_MASK       \
7222         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
7223          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
7224          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
7225 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
7226 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
7227  
7228  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
7229  
7230 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
7231 index 47b9ebd4a74f..2543aab05daa 100644
7232 --- a/include/linux/irq_work.h
7233 +++ b/include/linux/irq_work.h
7234 @@ -16,6 +16,7 @@
7235  #define IRQ_WORK_BUSY          2UL
7236  #define IRQ_WORK_FLAGS         3UL
7237  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
7238 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
7239  
7240  struct irq_work {
7241         unsigned long flags;
7242 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
7243  static inline void irq_work_run(void) { }
7244  #endif
7245  
7246 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
7247 +void irq_work_tick_soft(void);
7248 +#else
7249 +static inline void irq_work_tick_soft(void) { }
7250 +#endif
7251 +
7252  #endif /* _LINUX_IRQ_WORK_H */
7253 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
7254 index c9be57931b58..eeeb540971ae 100644
7255 --- a/include/linux/irqdesc.h
7256 +++ b/include/linux/irqdesc.h
7257 @@ -66,6 +66,7 @@ struct irq_desc {
7258         unsigned int            irqs_unhandled;
7259         atomic_t                threads_handled;
7260         int                     threads_handled_last;
7261 +       u64                     random_ip;
7262         raw_spinlock_t          lock;
7263         struct cpumask          *percpu_enabled;
7264         const struct cpumask    *percpu_affinity;
7265 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
7266 index 5dd1272d1ab2..9b77034f7c5e 100644
7267 --- a/include/linux/irqflags.h
7268 +++ b/include/linux/irqflags.h
7269 @@ -25,8 +25,6 @@
7270  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
7271  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
7272  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
7273 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
7274 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
7275  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
7276  #else
7277  # define trace_hardirqs_on()           do { } while (0)
7278 @@ -39,9 +37,15 @@
7279  # define trace_softirqs_enabled(p)     0
7280  # define trace_hardirq_enter()         do { } while (0)
7281  # define trace_hardirq_exit()          do { } while (0)
7282 +# define INIT_TRACE_IRQFLAGS
7283 +#endif
7284 +
7285 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
7286 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
7287 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
7288 +#else
7289  # define lockdep_softirq_enter()       do { } while (0)
7290  # define lockdep_softirq_exit()                do { } while (0)
7291 -# define INIT_TRACE_IRQFLAGS
7292  #endif
7293  
7294  #if defined(CONFIG_IRQSOFF_TRACER) || \
7295 @@ -148,4 +152,23 @@
7296  
7297  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
7298  
7299 +/*
7300 + * local_irq* variants depending on RT/!RT
7301 + */
7302 +#ifdef CONFIG_PREEMPT_RT_FULL
7303 +# define local_irq_disable_nort()      do { } while (0)
7304 +# define local_irq_enable_nort()       do { } while (0)
7305 +# define local_irq_save_nort(flags)    local_save_flags(flags)
7306 +# define local_irq_restore_nort(flags) (void)(flags)
7307 +# define local_irq_disable_rt()                local_irq_disable()
7308 +# define local_irq_enable_rt()         local_irq_enable()
7309 +#else
7310 +# define local_irq_disable_nort()      local_irq_disable()
7311 +# define local_irq_enable_nort()       local_irq_enable()
7312 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7313 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7314 +# define local_irq_disable_rt()                do { } while (0)
7315 +# define local_irq_enable_rt()         do { } while (0)
7316 +#endif
7317 +
7318  #endif
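
The *_nort variants above let a code path keep disabling hard interrupts on !RT while becoming (almost) a no-op on RT, where the data is expected to be serialized by sleeping locks and threaded handlers instead. A hedged sketch of the conversion pattern used throughout this series; touch_device_state() is a hypothetical example:

#include <linux/irqflags.h>

static void touch_device_state(void)
{
	unsigned long flags;

	/*
	 * !RT: a real local_irq_save(), protecting against the hard irq
	 * handler.  RT: only the flags are saved; the handler runs as a
	 * thread and is serialized by the surrounding sleeping lock.
	 */
	local_irq_save_nort(flags);
	/* ... touch state shared with the interrupt handler ... */
	local_irq_restore_nort(flags);
}
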
7319 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
7320 index dfaa1f4dcb0c..d57dd06544a1 100644
7321 --- a/include/linux/jbd2.h
7322 +++ b/include/linux/jbd2.h
7323 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
7324  
7325  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7326  {
7327 +#ifndef CONFIG_PREEMPT_RT_BASE
7328         bit_spin_lock(BH_State, &bh->b_state);
7329 +#else
7330 +       spin_lock(&bh->b_state_lock);
7331 +#endif
7332  }
7333  
7334  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7335  {
7336 +#ifndef CONFIG_PREEMPT_RT_BASE
7337         return bit_spin_trylock(BH_State, &bh->b_state);
7338 +#else
7339 +       return spin_trylock(&bh->b_state_lock);
7340 +#endif
7341  }
7342  
7343  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7344  {
7345 +#ifndef CONFIG_PREEMPT_RT_BASE
7346         return bit_spin_is_locked(BH_State, &bh->b_state);
7347 +#else
7348 +       return spin_is_locked(&bh->b_state_lock);
7349 +#endif
7350  }
7351  
7352  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7353  {
7354 +#ifndef CONFIG_PREEMPT_RT_BASE
7355         bit_spin_unlock(BH_State, &bh->b_state);
7356 +#else
7357 +       spin_unlock(&bh->b_state_lock);
7358 +#endif
7359  }
7360  
7361  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7362  {
7363 +#ifndef CONFIG_PREEMPT_RT_BASE
7364         bit_spin_lock(BH_JournalHead, &bh->b_state);
7365 +#else
7366 +       spin_lock(&bh->b_journal_head_lock);
7367 +#endif
7368  }
7369  
7370  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7371  {
7372 +#ifndef CONFIG_PREEMPT_RT_BASE
7373         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7374 +#else
7375 +       spin_unlock(&bh->b_journal_head_lock);
7376 +#endif
7377  }
7378  
7379  #define J_ASSERT(assert)       BUG_ON(!(assert))
7380 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
7381 index 410decacff8f..0861bebfc188 100644
7382 --- a/include/linux/kdb.h
7383 +++ b/include/linux/kdb.h
7384 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
7385  extern __printf(1, 2) int kdb_printf(const char *, ...);
7386  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7387  
7388 +#define in_kdb_printk()        (kdb_trap_printk)
7389  extern void kdb_init(int level);
7390  
7391  /* Access to kdb specific polling devices */
7392 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
7393  extern int kdb_unregister(char *);
7394  #else /* ! CONFIG_KGDB_KDB */
7395  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7396 +#define in_kdb_printk() (0)
7397  static inline void kdb_init(int level) {}
7398  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7399                                char *help, short minlen) { return 0; }
7400 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
7401 index bc6ed52a39b9..7894d55e4998 100644
7402 --- a/include/linux/kernel.h
7403 +++ b/include/linux/kernel.h
7404 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
7405   */
7406  # define might_sleep() \
7407         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7408 +
7409 +# define might_sleep_no_state_check() \
7410 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7411  # define sched_annotate_sleep()        (current->task_state_change = 0)
7412  #else
7413    static inline void ___might_sleep(const char *file, int line,
7414 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
7415    static inline void __might_sleep(const char *file, int line,
7416                                    int preempt_offset) { }
7417  # define might_sleep() do { might_resched(); } while (0)
7418 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7419  # define sched_annotate_sleep() do { } while (0)
7420  #endif
7421  
7422 @@ -488,6 +492,7 @@ extern enum system_states {
7423         SYSTEM_HALT,
7424         SYSTEM_POWER_OFF,
7425         SYSTEM_RESTART,
7426 +       SYSTEM_SUSPEND,
7427  } system_state;
7428  
7429  #define TAINT_PROPRIETARY_MODULE       0
7430 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
7431 index cb483305e1f5..4e5062316bb6 100644
7432 --- a/include/linux/list_bl.h
7433 +++ b/include/linux/list_bl.h
7434 @@ -2,6 +2,7 @@
7435  #define _LINUX_LIST_BL_H
7436  
7437  #include <linux/list.h>
7438 +#include <linux/spinlock.h>
7439  #include <linux/bit_spinlock.h>
7440  
7441  /*
7442 @@ -32,13 +33,24 @@
7443  
7444  struct hlist_bl_head {
7445         struct hlist_bl_node *first;
7446 +#ifdef CONFIG_PREEMPT_RT_BASE
7447 +       raw_spinlock_t lock;
7448 +#endif
7449  };
7450  
7451  struct hlist_bl_node {
7452         struct hlist_bl_node *next, **pprev;
7453  };
7454 -#define INIT_HLIST_BL_HEAD(ptr) \
7455 -       ((ptr)->first = NULL)
7456 +
7457 +#ifdef CONFIG_PREEMPT_RT_BASE
7458 +#define INIT_HLIST_BL_HEAD(h)          \
7459 +do {                                   \
7460 +       (h)->first = NULL;              \
7461 +       raw_spin_lock_init(&(h)->lock); \
7462 +} while (0)
7463 +#else
7464 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7465 +#endif
7466  
7467  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7468  {
7469 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
7470  
7471  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7472  {
7473 +#ifndef CONFIG_PREEMPT_RT_BASE
7474         bit_spin_lock(0, (unsigned long *)b);
7475 +#else
7476 +       raw_spin_lock(&b->lock);
7477 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7478 +       __set_bit(0, (unsigned long *)b);
7479 +#endif
7480 +#endif
7481  }
7482  
7483  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7484  {
7485 +#ifndef CONFIG_PREEMPT_RT_BASE
7486         __bit_spin_unlock(0, (unsigned long *)b);
7487 +#else
7488 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7489 +       __clear_bit(0, (unsigned long *)b);
7490 +#endif
7491 +       raw_spin_unlock(&b->lock);
7492 +#endif
7493  }
7494  
7495  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
7496 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
7497 new file mode 100644
7498 index 000000000000..845c77f1a5ca
7499 --- /dev/null
7500 +++ b/include/linux/locallock.h
7501 @@ -0,0 +1,278 @@
7502 +#ifndef _LINUX_LOCALLOCK_H
7503 +#define _LINUX_LOCALLOCK_H
7504 +
7505 +#include <linux/percpu.h>
7506 +#include <linux/spinlock.h>
7507 +
7508 +#ifdef CONFIG_PREEMPT_RT_BASE
7509 +
7510 +#ifdef CONFIG_DEBUG_SPINLOCK
7511 +# define LL_WARN(cond) WARN_ON(cond)
7512 +#else
7513 +# define LL_WARN(cond) do { } while (0)
7514 +#endif
7515 +
7516 +/*
7517 + * per cpu lock based substitute for local_irq_*()
7518 + */
7519 +struct local_irq_lock {
7520 +       spinlock_t              lock;
7521 +       struct task_struct      *owner;
7522 +       int                     nestcnt;
7523 +       unsigned long           flags;
7524 +};
7525 +
7526 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7527 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7528 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7529 +
7530 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7531 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7532 +
7533 +#define local_irq_lock_init(lvar)                                      \
7534 +       do {                                                            \
7535 +               int __cpu;                                              \
7536 +               for_each_possible_cpu(__cpu)                            \
7537 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7538 +       } while (0)
7539 +
7540 +/*
7541 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
7542 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
7543 + * already takes care of the migrate_disable/enable
7544 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
7545 + */
7546 +#ifdef CONFIG_PREEMPT_RT_FULL
7547 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7548 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7549 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7550 +#else
7551 +# define spin_lock_local(lock)                 spin_lock(lock)
7552 +# define spin_trylock_local(lock)              spin_trylock(lock)
7553 +# define spin_unlock_local(lock)               spin_unlock(lock)
7554 +#endif
7555 +
7556 +static inline void __local_lock(struct local_irq_lock *lv)
7557 +{
7558 +       if (lv->owner != current) {
7559 +               spin_lock_local(&lv->lock);
7560 +               LL_WARN(lv->owner);
7561 +               LL_WARN(lv->nestcnt);
7562 +               lv->owner = current;
7563 +       }
7564 +       lv->nestcnt++;
7565 +}
7566 +
7567 +#define local_lock(lvar)                                       \
7568 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7569 +
7570 +#define local_lock_on(lvar, cpu)                               \
7571 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7572 +
7573 +static inline int __local_trylock(struct local_irq_lock *lv)
7574 +{
7575 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7576 +               LL_WARN(lv->owner);
7577 +               LL_WARN(lv->nestcnt);
7578 +               lv->owner = current;
7579 +               lv->nestcnt = 1;
7580 +               return 1;
7581 +       }
7582 +       return 0;
7583 +}
7584 +
7585 +#define local_trylock(lvar)                                            \
7586 +       ({                                                              \
7587 +               int __locked;                                           \
7588 +               __locked = __local_trylock(&get_local_var(lvar));       \
7589 +               if (!__locked)                                          \
7590 +                       put_local_var(lvar);                            \
7591 +               __locked;                                               \
7592 +       })
7593 +
7594 +static inline void __local_unlock(struct local_irq_lock *lv)
7595 +{
7596 +       LL_WARN(lv->nestcnt == 0);
7597 +       LL_WARN(lv->owner != current);
7598 +       if (--lv->nestcnt)
7599 +               return;
7600 +
7601 +       lv->owner = NULL;
7602 +       spin_unlock_local(&lv->lock);
7603 +}
7604 +
7605 +#define local_unlock(lvar)                                     \
7606 +       do {                                                    \
7607 +               __local_unlock(this_cpu_ptr(&lvar));            \
7608 +               put_local_var(lvar);                            \
7609 +       } while (0)
7610 +
7611 +#define local_unlock_on(lvar, cpu)                       \
7612 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7613 +
7614 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7615 +{
7616 +       spin_lock_irqsave(&lv->lock, lv->flags);
7617 +       LL_WARN(lv->owner);
7618 +       LL_WARN(lv->nestcnt);
7619 +       lv->owner = current;
7620 +       lv->nestcnt = 1;
7621 +}
7622 +
7623 +#define local_lock_irq(lvar)                                           \
7624 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7625 +
7626 +#define local_lock_irq_on(lvar, cpu)                                   \
7627 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7628 +
7629 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7630 +{
7631 +       LL_WARN(!lv->nestcnt);
7632 +       LL_WARN(lv->owner != current);
7633 +       lv->owner = NULL;
7634 +       lv->nestcnt = 0;
7635 +       spin_unlock_irq(&lv->lock);
7636 +}
7637 +
7638 +#define local_unlock_irq(lvar)                                         \
7639 +       do {                                                            \
7640 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7641 +               put_local_var(lvar);                                    \
7642 +       } while (0)
7643 +
7644 +#define local_unlock_irq_on(lvar, cpu)                                 \
7645 +       do {                                                            \
7646 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7647 +       } while (0)
7648 +
7649 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7650 +{
7651 +       if (lv->owner != current) {
7652 +               __local_lock_irq(lv);
7653 +               return 0;
7654 +       } else {
7655 +               lv->nestcnt++;
7656 +               return 1;
7657 +       }
7658 +}
7659 +
7660 +#define local_lock_irqsave(lvar, _flags)                               \
7661 +       do {                                                            \
7662 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7663 +                       put_local_var(lvar);                            \
7664 +               _flags = __this_cpu_read(lvar.flags);                   \
7665 +       } while (0)
7666 +
7667 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7668 +       do {                                                            \
7669 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7670 +               _flags = per_cpu(lvar, cpu).flags;                      \
7671 +       } while (0)
7672 +
7673 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7674 +                                           unsigned long flags)
7675 +{
7676 +       LL_WARN(!lv->nestcnt);
7677 +       LL_WARN(lv->owner != current);
7678 +       if (--lv->nestcnt)
7679 +               return 0;
7680 +
7681 +       lv->owner = NULL;
7682 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7683 +       return 1;
7684 +}
7685 +
7686 +#define local_unlock_irqrestore(lvar, flags)                           \
7687 +       do {                                                            \
7688 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7689 +                       put_local_var(lvar);                            \
7690 +       } while (0)
7691 +
7692 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7693 +       do {                                                            \
7694 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7695 +       } while (0)
7696 +
7697 +#define local_spin_trylock_irq(lvar, lock)                             \
7698 +       ({                                                              \
7699 +               int __locked;                                           \
7700 +               local_lock_irq(lvar);                                   \
7701 +               __locked = spin_trylock(lock);                          \
7702 +               if (!__locked)                                          \
7703 +                       local_unlock_irq(lvar);                         \
7704 +               __locked;                                               \
7705 +       })
7706 +
7707 +#define local_spin_lock_irq(lvar, lock)                                        \
7708 +       do {                                                            \
7709 +               local_lock_irq(lvar);                                   \
7710 +               spin_lock(lock);                                        \
7711 +       } while (0)
7712 +
7713 +#define local_spin_unlock_irq(lvar, lock)                              \
7714 +       do {                                                            \
7715 +               spin_unlock(lock);                                      \
7716 +               local_unlock_irq(lvar);                                 \
7717 +       } while (0)
7718 +
7719 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7720 +       do {                                                            \
7721 +               local_lock_irqsave(lvar, flags);                        \
7722 +               spin_lock(lock);                                        \
7723 +       } while (0)
7724 +
7725 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7726 +       do {                                                            \
7727 +               spin_unlock(lock);                                      \
7728 +               local_unlock_irqrestore(lvar, flags);                   \
7729 +       } while (0)
7730 +
7731 +#define get_locked_var(lvar, var)                                      \
7732 +       (*({                                                            \
7733 +               local_lock(lvar);                                       \
7734 +               this_cpu_ptr(&var);                                     \
7735 +       }))
7736 +
7737 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7738 +
7739 +#define local_lock_cpu(lvar)                                           \
7740 +       ({                                                              \
7741 +               local_lock(lvar);                                       \
7742 +               smp_processor_id();                                     \
7743 +       })
7744 +
7745 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7746 +
7747 +#else /* PREEMPT_RT_BASE */
7748 +
7749 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7750 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7751 +
7752 +static inline void local_irq_lock_init(int lvar) { }
7753 +
7754 +#define local_lock(lvar)                       preempt_disable()
7755 +#define local_unlock(lvar)                     preempt_enable()
7756 +#define local_lock_irq(lvar)                   local_irq_disable()
7757 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7758 +#define local_unlock_irq(lvar)                 local_irq_enable()
7759 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7760 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7761 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7762 +
7763 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7764 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7765 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7766 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7767 +       spin_lock_irqsave(lock, flags)
7768 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7769 +       spin_unlock_irqrestore(lock, flags)
7770 +
7771 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7772 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7773 +
7774 +#define local_lock_cpu(lvar)                   get_cpu()
7775 +#define local_unlock_cpu(lvar)                 put_cpu()
7776 +
7777 +#endif
7778 +
7779 +#endif
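
locallock.h is new with this series: on RT a local lock is a per-CPU spinlock (a sleeping lock there) that records owner and nesting depth, while on !RT the macros collapse to the usual preempt/irq disabling. A minimal sketch of the conversion pattern, assuming the hypothetical names my_lock, my_counter and bump_counter():

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);		/* hypothetical */
static DEFINE_LOCAL_IRQ_LOCK(my_lock);		/* hypothetical */

static void bump_counter(void)
{
	unsigned long flags;

	/* !RT: local_irq_save(flags); RT: take the per-CPU lock instead. */
	local_lock_irqsave(my_lock, flags);
	__this_cpu_inc(my_counter);
	local_unlock_irqrestore(my_lock, flags);
}
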
7780 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
7781 index 08d947fc4c59..705fb564a605 100644
7782 --- a/include/linux/mm_types.h
7783 +++ b/include/linux/mm_types.h
7784 @@ -11,6 +11,7 @@
7785  #include <linux/completion.h>
7786  #include <linux/cpumask.h>
7787  #include <linux/uprobes.h>
7788 +#include <linux/rcupdate.h>
7789  #include <linux/page-flags-layout.h>
7790  #include <linux/workqueue.h>
7791  #include <asm/page.h>
7792 @@ -509,6 +510,9 @@ struct mm_struct {
7793         bool tlb_flush_pending;
7794  #endif
7795         struct uprobes_state uprobes_state;
7796 +#ifdef CONFIG_PREEMPT_RT_BASE
7797 +       struct rcu_head delayed_drop;
7798 +#endif
7799  #ifdef CONFIG_X86_INTEL_MPX
7800         /* address of the bounds directory */
7801         void __user *bd_addr;
7802 diff --git a/include/linux/module.h b/include/linux/module.h
7803 index 0c3207d26ac0..5944baaa3f28 100644
7804 --- a/include/linux/module.h
7805 +++ b/include/linux/module.h
7806 @@ -496,6 +496,7 @@ static inline int module_is_live(struct module *mod)
7807  struct module *__module_text_address(unsigned long addr);
7808  struct module *__module_address(unsigned long addr);
7809  bool is_module_address(unsigned long addr);
7810 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
7811  bool is_module_percpu_address(unsigned long addr);
7812  bool is_module_text_address(unsigned long addr);
7813  
7814 @@ -663,6 +664,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
7815         return false;
7816  }
7817  
7818 +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
7819 +{
7820 +       return false;
7821 +}
7822 +
7823  static inline bool is_module_text_address(unsigned long addr)
7824  {
7825         return false;
7826 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
7827 index 2cb7531e7d7a..b3fdfc820216 100644
7828 --- a/include/linux/mutex.h
7829 +++ b/include/linux/mutex.h
7830 @@ -19,6 +19,17 @@
7831  #include <asm/processor.h>
7832  #include <linux/osq_lock.h>
7833  
7834 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7835 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7836 +       , .dep_map = { .name = #lockname }
7837 +#else
7838 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7839 +#endif
7840 +
7841 +#ifdef CONFIG_PREEMPT_RT_FULL
7842 +# include <linux/mutex_rt.h>
7843 +#else
7844 +
7845  /*
7846   * Simple, straightforward mutexes with strict semantics:
7847   *
7848 @@ -99,13 +110,6 @@ do {                                                        \
7849  static inline void mutex_destroy(struct mutex *lock) {}
7850  #endif
7851  
7852 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7853 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7854 -               , .dep_map = { .name = #lockname }
7855 -#else
7856 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7857 -#endif
7858 -
7859  #define __MUTEX_INITIALIZER(lockname) \
7860                 { .count = ATOMIC_INIT(1) \
7861                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7862 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
7863  extern int mutex_trylock(struct mutex *lock);
7864  extern void mutex_unlock(struct mutex *lock);
7865  
7866 +#endif /* !PREEMPT_RT_FULL */
7867 +
7868  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7869  
7870  #endif /* __LINUX_MUTEX_H */
7871 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
7872 new file mode 100644
7873 index 000000000000..e0284edec655
7874 --- /dev/null
7875 +++ b/include/linux/mutex_rt.h
7876 @@ -0,0 +1,89 @@
7877 +#ifndef __LINUX_MUTEX_RT_H
7878 +#define __LINUX_MUTEX_RT_H
7879 +
7880 +#ifndef __LINUX_MUTEX_H
7881 +#error "Please include mutex.h"
7882 +#endif
7883 +
7884 +#include <linux/rtmutex.h>
7885 +
7886 +/* FIXME: Just for __lockfunc */
7887 +#include <linux/spinlock.h>
7888 +
7889 +struct mutex {
7890 +       struct rt_mutex         lock;
7891 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7892 +       struct lockdep_map      dep_map;
7893 +#endif
7894 +};
7895 +
7896 +#define __MUTEX_INITIALIZER(mutexname)                                 \
7897 +       {                                                               \
7898 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
7899 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
7900 +       }
7901 +
7902 +#define DEFINE_MUTEX(mutexname)                                                \
7903 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7904 +
7905 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7906 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7907 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7908 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7909 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7910 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7911 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7912 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7913 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7914 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7915 +
7916 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
7917 +#define mutex_lock(l)                  _mutex_lock(l)
7918 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
7919 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
7920 +#define mutex_trylock(l)               _mutex_trylock(l)
7921 +#define mutex_unlock(l)                        _mutex_unlock(l)
7922 +
7923 +#ifdef CONFIG_DEBUG_MUTEXES
7924 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
7925 +#else
7926 +static inline void mutex_destroy(struct mutex *lock) {}
7927 +#endif
7928 +
7929 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7930 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
7931 +# define mutex_lock_interruptible_nested(l, s) \
7932 +                                       _mutex_lock_interruptible_nested(l, s)
7933 +# define mutex_lock_killable_nested(l, s) \
7934 +                                       _mutex_lock_killable_nested(l, s)
7935 +
7936 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
7937 +do {                                                                   \
7938 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
7939 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
7940 +} while (0)
7941 +
7942 +#else
7943 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
7944 +# define mutex_lock_interruptible_nested(l, s) \
7945 +                                       _mutex_lock_interruptible(l)
7946 +# define mutex_lock_killable_nested(l, s) \
7947 +                                       _mutex_lock_killable(l)
7948 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7949 +#endif
7950 +
7951 +# define mutex_init(mutex)                             \
7952 +do {                                                   \
7953 +       static struct lock_class_key __key;             \
7954 +                                                       \
7955 +       rt_mutex_init(&(mutex)->lock);                  \
7956 +       __mutex_do_init((mutex), #mutex, &__key);       \
7957 +} while (0)
7958 +
7959 +# define __mutex_init(mutex, name, key)                        \
7960 +do {                                                   \
7961 +       rt_mutex_init(&(mutex)->lock);                  \
7962 +       __mutex_do_init((mutex), name, key);            \
7963 +} while (0)
7964 +
7965 +#endif
7966 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
7967 index bb9b102c15cd..a5b12b8ad196 100644
7968 --- a/include/linux/netdevice.h
7969 +++ b/include/linux/netdevice.h
7970 @@ -396,7 +396,19 @@ typedef enum rx_handler_result rx_handler_result_t;
7971  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7972  
7973  void __napi_schedule(struct napi_struct *n);
7974 +
7975 +/*
7976 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7977 + * run as threads, and they can also be preempted (without PREEMPT_RT
7978 + * interrupt threads can not be preempted). Which means that calling
7979 + * __napi_schedule_irqoff() from an interrupt handler can be preempted
7980 + * and can corrupt the napi->poll_list.
7981 + */
7982 +#ifdef CONFIG_PREEMPT_RT_FULL
7983 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7984 +#else
7985  void __napi_schedule_irqoff(struct napi_struct *n);
7986 +#endif
7987  
7988  static inline bool napi_disable_pending(struct napi_struct *n)
7989  {
7990 @@ -2463,14 +2475,53 @@ void netdev_freemem(struct net_device *dev);
7991  void synchronize_net(void);
7992  int init_dummy_netdev(struct net_device *dev);
7993  
7994 -DECLARE_PER_CPU(int, xmit_recursion);
7995  #define XMIT_RECURSION_LIMIT   10
7996 +#ifdef CONFIG_PREEMPT_RT_FULL
7997 +static inline int dev_recursion_level(void)
7998 +{
7999 +       return current->xmit_recursion;
8000 +}
8001 +
8002 +static inline int xmit_rec_read(void)
8003 +{
8004 +       return current->xmit_recursion;
8005 +}
8006 +
8007 +static inline void xmit_rec_inc(void)
8008 +{
8009 +       current->xmit_recursion++;
8010 +}
8011 +
8012 +static inline void xmit_rec_dec(void)
8013 +{
8014 +       current->xmit_recursion--;
8015 +}
8016 +
8017 +#else
8018 +
8019 +DECLARE_PER_CPU(int, xmit_recursion);
8020  
8021  static inline int dev_recursion_level(void)
8022  {
8023         return this_cpu_read(xmit_recursion);
8024  }
8025  
8026 +static inline int xmit_rec_read(void)
8027 +{
8028 +       return __this_cpu_read(xmit_recursion);
8029 +}
8030 +
8031 +static inline void xmit_rec_inc(void)
8032 +{
8033 +       __this_cpu_inc(xmit_recursion);
8034 +}
8035 +
8036 +static inline void xmit_rec_dec(void)
8037 +{
8038 +       __this_cpu_dec(xmit_recursion);
8039 +}
8040 +#endif
8041 +
8042  struct net_device *dev_get_by_index(struct net *net, int ifindex);
8043  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
8044  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
8045 @@ -2855,6 +2906,7 @@ struct softnet_data {
8046         unsigned int            dropped;
8047         struct sk_buff_head     input_pkt_queue;
8048         struct napi_struct      backlog;
8049 +       struct sk_buff_head     tofree_queue;
8050  
8051  };
8052  
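
The xmit_rec_read()/xmit_rec_inc()/xmit_rec_dec() helpers wrap the transmit recursion counter so RT can keep it in the task (which may migrate between CPUs inside the transmit path) rather than per CPU. A hedged sketch of how a nested transmit might guard against recursion with these helpers; xmit_one_guarded() and the -ELOOP error choice are hypothetical:

#include <linux/netdevice.h>
#include <linux/errno.h>

static int xmit_one_guarded(struct sk_buff *skb)
{
	int ret;

	if (xmit_rec_read() > XMIT_RECURSION_LIMIT)
		return -ELOOP;		/* hypothetical error handling */

	xmit_rec_inc();
	ret = dev_queue_xmit(skb);	/* may re-enter the transmit path */
	xmit_rec_dec();

	return ret;
}
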
8053 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
8054 index 2ad1a2b289b5..b4d10155af54 100644
8055 --- a/include/linux/netfilter/x_tables.h
8056 +++ b/include/linux/netfilter/x_tables.h
8057 @@ -4,6 +4,7 @@
8058  
8059  #include <linux/netdevice.h>
8060  #include <linux/static_key.h>
8061 +#include <linux/locallock.h>
8062  #include <uapi/linux/netfilter/x_tables.h>
8063  
8064  /* Test a struct->invflags and a boolean for inequality */
8065 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
8066   */
8067  DECLARE_PER_CPU(seqcount_t, xt_recseq);
8068  
8069 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
8070 +
8071  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
8072   *
8073   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
8074 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
8075  {
8076         unsigned int addend;
8077  
8078 +       /* RT protection */
8079 +       local_lock(xt_write_lock);
8080 +
8081         /*
8082          * Low order bit of sequence is set if we already
8083          * called xt_write_recseq_begin().
8084 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
8085         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
8086         smp_wmb();
8087         __this_cpu_add(xt_recseq.sequence, addend);
8088 +       local_unlock(xt_write_lock);
8089  }
8090  
8091  /*
8092 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
8093 index 810124b33327..d54ca43d571f 100644
8094 --- a/include/linux/nfs_fs.h
8095 +++ b/include/linux/nfs_fs.h
8096 @@ -165,7 +165,11 @@ struct nfs_inode {
8097  
8098         /* Readers: in-flight sillydelete RPC calls */
8099         /* Writers: rmdir */
8100 +#ifdef CONFIG_PREEMPT_RT_BASE
8101 +       struct semaphore        rmdir_sem;
8102 +#else
8103         struct rw_semaphore     rmdir_sem;
8104 +#endif
8105  
8106  #if IS_ENABLED(CONFIG_NFS_V4)
8107         struct nfs4_cached_acl  *nfs4_acl;
8108 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
8109 index beb1e10f446e..ebaf2e7bfe29 100644
8110 --- a/include/linux/nfs_xdr.h
8111 +++ b/include/linux/nfs_xdr.h
8112 @@ -1490,7 +1490,7 @@ struct nfs_unlinkdata {
8113         struct nfs_removeargs args;
8114         struct nfs_removeres res;
8115         struct dentry *dentry;
8116 -       wait_queue_head_t wq;
8117 +       struct swait_queue_head wq;
8118         struct rpc_cred *cred;
8119         struct nfs_fattr dir_attr;
8120         long timeout;
8121 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
8122 index 4149868de4e6..babe5b9bcb91 100644
8123 --- a/include/linux/notifier.h
8124 +++ b/include/linux/notifier.h
8125 @@ -6,7 +6,7 @@
8126   *
8127   *                             Alan Cox <Alan.Cox@linux.org>
8128   */
8129 - 
8130 +
8131  #ifndef _LINUX_NOTIFIER_H
8132  #define _LINUX_NOTIFIER_H
8133  #include <linux/errno.h>
8134 @@ -42,9 +42,7 @@
8135   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
8136   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
8137   * SRCU notifier chains should be used when the chain will be called very
8138 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
8139 - * chains are slightly more difficult to use because they require special
8140 - * runtime initialization.
8141 + * often but notifier_blocks will seldom be removed.
8142   */
8143  
8144  struct notifier_block;
8145 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
8146                 (name)->head = NULL;            \
8147         } while (0)
8148  
8149 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
8150 +/* srcu_notifier_heads must be cleaned up dynamically */
8151  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8152  #define srcu_cleanup_notifier_head(name)       \
8153                 cleanup_srcu_struct(&(name)->srcu);
8154 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8155                 .head = NULL }
8156  #define RAW_NOTIFIER_INIT(name)        {                               \
8157                 .head = NULL }
8158 -/* srcu_notifier_heads cannot be initialized statically */
8159 +
8160 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
8161 +       {                                                       \
8162 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
8163 +               .head = NULL,                                   \
8164 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
8165 +       }
8166  
8167  #define ATOMIC_NOTIFIER_HEAD(name)                             \
8168         struct atomic_notifier_head name =                      \
8169 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8170         struct raw_notifier_head name =                         \
8171                 RAW_NOTIFIER_INIT(name)
8172  
8173 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
8174 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
8175 +                       name##_head_srcu_array);                \
8176 +       mod struct srcu_notifier_head name =                    \
8177 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
8178 +
8179 +#define SRCU_NOTIFIER_HEAD(name)                               \
8180 +       _SRCU_NOTIFIER_HEAD(name, )
8181 +
8182 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
8183 +       _SRCU_NOTIFIER_HEAD(name, static)
8184 +
8185  #ifdef __KERNEL__
8186  
8187  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
8188 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
8189  
8190  /*
8191   *     Declared notifiers so far. I can imagine quite a few more chains
8192 - *     over time (eg laptop power reset chains, reboot chain (to clean 
8193 + *     over time (eg laptop power reset chains, reboot chain (to clean
8194   *     device units up), device [un]mount chain, module load/unload chain,
8195 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
8196 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
8197   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
8198   */
8199 - 
8200 +
8201  /* CPU notfiers are defined in include/linux/cpu.h. */
8202  
8203  /* netdevice notifiers are defined in include/linux/netdevice.h */
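
SRCU_NOTIFIER_HEAD()/SRCU_NOTIFIER_HEAD_STATIC() are new here: they allow an SRCU notifier chain to be initialized at build time, where previously srcu_init_notifier_head() had to run first. A hedged sketch with hypothetical names (my_chain, my_nb, my_event_cb, my_chain_demo):

#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_chain);		/* hypothetical chain */

static int my_event_cb(struct notifier_block *nb,
		       unsigned long event, void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {		/* hypothetical block */
	.notifier_call = my_event_cb,
};

static void my_chain_demo(void)
{
	srcu_notifier_chain_register(&my_chain, &my_nb);
	srcu_notifier_call_chain(&my_chain, 0, NULL);
	srcu_notifier_chain_unregister(&my_chain, &my_nb);
}
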
8204 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
8205 index 5b2e6159b744..ea940f451606 100644
8206 --- a/include/linux/percpu-rwsem.h
8207 +++ b/include/linux/percpu-rwsem.h
8208 @@ -4,7 +4,7 @@
8209  #include <linux/atomic.h>
8210  #include <linux/rwsem.h>
8211  #include <linux/percpu.h>
8212 -#include <linux/wait.h>
8213 +#include <linux/swait.h>
8214  #include <linux/rcu_sync.h>
8215  #include <linux/lockdep.h>
8216  
8217 @@ -12,7 +12,7 @@ struct percpu_rw_semaphore {
8218         struct rcu_sync         rss;
8219         unsigned int __percpu   *read_count;
8220         struct rw_semaphore     rw_sem;
8221 -       wait_queue_head_t       writer;
8222 +       struct swait_queue_head writer;
8223         int                     readers_block;
8224  };
8225  
8226 @@ -22,13 +22,13 @@ static struct percpu_rw_semaphore name = {                          \
8227         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
8228         .read_count = &__percpu_rwsem_rc_##name,                        \
8229         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
8230 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
8231 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
8232  }
8233  
8234  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
8235  extern void __percpu_up_read(struct percpu_rw_semaphore *);
8236  
8237 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
8238 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8239  {
8240         might_sleep();
8241  
8242 @@ -46,16 +46,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
8243         __this_cpu_inc(*sem->read_count);
8244         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
8245                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
8246 -       barrier();
8247         /*
8248 -        * The barrier() prevents the compiler from
8249 +        * The preempt_enable() prevents the compiler from
8250          * bleeding the critical section out.
8251          */
8252 -}
8253 -
8254 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8255 -{
8256 -       percpu_down_read_preempt_disable(sem);
8257         preempt_enable();
8258  }
8259  
8260 @@ -82,13 +76,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
8261         return ret;
8262  }
8263  
8264 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
8265 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8266  {
8267 -       /*
8268 -        * The barrier() prevents the compiler from
8269 -        * bleeding the critical section out.
8270 -        */
8271 -       barrier();
8272 +       preempt_disable();
8273         /*
8274          * Same as in percpu_down_read().
8275          */
8276 @@ -101,12 +91,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
8277         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
8278  }
8279  
8280 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8281 -{
8282 -       preempt_disable();
8283 -       percpu_up_read_preempt_enable(sem);
8284 -}
8285 -
8286  extern void percpu_down_write(struct percpu_rw_semaphore *);
8287  extern void percpu_up_write(struct percpu_rw_semaphore *);
8288  
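
[Editor's note] This percpu-rwsem.h change folds the preempt_disable()/preempt_enable() bookkeeping into percpu_down_read()/percpu_up_read() themselves (the *_preempt_disable/_preempt_enable variants disappear) and moves the writer onto a simple wait queue (swait), whose wakeups are safe from the contexts RT cares about. Reader-side call sites are unchanged; a hedged usage sketch, with a hypothetical instance name:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(demo_rwsem);  /* hypothetical instance */

static void demo_reader(void)
{
        percpu_down_read(&demo_rwsem);   /* per-CPU counter fast path */
        /* read-side section; the lock itself may sleep on contention */
        percpu_up_read(&demo_rwsem);
}

static void demo_writer(void)
{
        percpu_down_write(&demo_rwsem);  /* waits out all readers via rcu_sync */
        /* exclusive section */
        percpu_up_write(&demo_rwsem);
}
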
8289 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
8290 index 56939d3f6e53..b988bf40ad3e 100644
8291 --- a/include/linux/percpu.h
8292 +++ b/include/linux/percpu.h
8293 @@ -18,6 +18,35 @@
8294  #define PERCPU_MODULE_RESERVE          0
8295  #endif
8296  
8297 +#ifdef CONFIG_PREEMPT_RT_FULL
8298 +
8299 +#define get_local_var(var) (*({        \
8300 +       migrate_disable();      \
8301 +       this_cpu_ptr(&var);     }))
8302 +
8303 +#define put_local_var(var) do {        \
8304 +       (void)&(var);           \
8305 +       migrate_enable();       \
8306 +} while (0)
8307 +
8308 +# define get_local_ptr(var) ({ \
8309 +       migrate_disable();      \
8310 +       this_cpu_ptr(var);      })
8311 +
8312 +# define put_local_ptr(var) do {       \
8313 +       (void)(var);                    \
8314 +       migrate_enable();               \
8315 +} while (0)
8316 +
8317 +#else
8318 +
8319 +#define get_local_var(var)     get_cpu_var(var)
8320 +#define put_local_var(var)     put_cpu_var(var)
8321 +#define get_local_ptr(var)     get_cpu_ptr(var)
8322 +#define put_local_ptr(var)     put_cpu_ptr(var)
8323 +
8324 +#endif
8325 +
8326  /* minimum unit size, also is the maximum supported allocation size */
8327  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
8328  
8329 @@ -110,6 +139,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
8330  #endif
8331  
8332  extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
8333 +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
8334  extern bool is_kernel_percpu_address(unsigned long addr);
8335  
8336  #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
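[Editor's note] The new get_local_var()/put_local_var() helpers give callers stable access to this CPU's copy of a per-CPU variable without making the section non-preemptible: on PREEMPT_RT_FULL they only disable migration, elsewhere they fall back to get_cpu_var()/put_cpu_var(). A minimal sketch, assuming a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_counter);       /* hypothetical per-CPU data */

static void demo_bump(void)
{
        /* RT: migrate_disable(), still preemptible; non-RT: preempt_disable() */
        int *cnt = &get_local_var(demo_counter);

        (*cnt)++;
        put_local_var(demo_counter);
}
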
8337 diff --git a/include/linux/pid.h b/include/linux/pid.h
8338 index 23705a53abba..2cc64b779f03 100644
8339 --- a/include/linux/pid.h
8340 +++ b/include/linux/pid.h
8341 @@ -2,6 +2,7 @@
8342  #define _LINUX_PID_H
8343  
8344  #include <linux/rcupdate.h>
8345 +#include <linux/atomic.h>
8346  
8347  enum pid_type
8348  {
8349 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
8350 index 75e4e30677f1..1cfb1cb72354 100644
8351 --- a/include/linux/preempt.h
8352 +++ b/include/linux/preempt.h
8353 @@ -50,7 +50,11 @@
8354  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8355  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8356  
8357 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8358 +#ifndef CONFIG_PREEMPT_RT_FULL
8359 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8360 +#else
8361 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8362 +#endif
8363  
8364  /* We use the MSB mostly because its available */
8365  #define PREEMPT_NEED_RESCHED   0x80000000
8366 @@ -59,9 +63,15 @@
8367  #include <asm/preempt.h>
8368  
8369  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8370 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8371  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8372                                  | NMI_MASK))
8373 +#ifndef CONFIG_PREEMPT_RT_FULL
8374 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8375 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8376 +#else
8377 +# define softirq_count()       (0UL)
8378 +extern int in_serving_softirq(void);
8379 +#endif
8380  
8381  /*
8382   * Are we doing bottom half or hardware interrupt processing?
8383 @@ -72,7 +82,6 @@
8384  #define in_irq()               (hardirq_count())
8385  #define in_softirq()           (softirq_count())
8386  #define in_interrupt()         (irq_count())
8387 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8388  
8389  /*
8390   * Are we in NMI context?
8391 @@ -91,7 +100,11 @@
8392  /*
8393   * The preempt_count offset after spin_lock()
8394   */
8395 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8396  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8397 +#else
8398 +#define PREEMPT_LOCK_OFFSET    0
8399 +#endif
8400  
8401  /*
8402   * The preempt_count offset needed for things like:
8403 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
8404  #define preempt_count_inc() preempt_count_add(1)
8405  #define preempt_count_dec() preempt_count_sub(1)
8406  
8407 +#ifdef CONFIG_PREEMPT_LAZY
8408 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8409 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8410 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8411 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8412 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8413 +#else
8414 +#define add_preempt_lazy_count(val)    do { } while (0)
8415 +#define sub_preempt_lazy_count(val)    do { } while (0)
8416 +#define inc_preempt_lazy_count()       do { } while (0)
8417 +#define dec_preempt_lazy_count()       do { } while (0)
8418 +#define preempt_lazy_count()           (0)
8419 +#endif
8420 +
8421  #ifdef CONFIG_PREEMPT_COUNT
8422  
8423  #define preempt_disable() \
8424 @@ -148,13 +175,25 @@ do { \
8425         barrier(); \
8426  } while (0)
8427  
8428 +#define preempt_lazy_disable() \
8429 +do { \
8430 +       inc_preempt_lazy_count(); \
8431 +       barrier(); \
8432 +} while (0)
8433 +
8434  #define sched_preempt_enable_no_resched() \
8435  do { \
8436         barrier(); \
8437         preempt_count_dec(); \
8438  } while (0)
8439  
8440 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8441 +#ifdef CONFIG_PREEMPT_RT_BASE
8442 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8443 +# define preempt_check_resched_rt() preempt_check_resched()
8444 +#else
8445 +# define preempt_enable_no_resched() preempt_enable()
8446 +# define preempt_check_resched_rt() barrier();
8447 +#endif
8448  
8449  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8450  
8451 @@ -179,6 +218,13 @@ do { \
8452                 __preempt_schedule(); \
8453  } while (0)
8454  
8455 +#define preempt_lazy_enable() \
8456 +do { \
8457 +       dec_preempt_lazy_count(); \
8458 +       barrier(); \
8459 +       preempt_check_resched(); \
8460 +} while (0)
8461 +
8462  #else /* !CONFIG_PREEMPT */
8463  #define preempt_enable() \
8464  do { \
8465 @@ -224,6 +270,7 @@ do { \
8466  #define preempt_disable_notrace()              barrier()
8467  #define preempt_enable_no_resched_notrace()    barrier()
8468  #define preempt_enable_notrace()               barrier()
8469 +#define preempt_check_resched_rt()             barrier()
8470  #define preemptible()                          0
8471  
8472  #endif /* CONFIG_PREEMPT_COUNT */
8473 @@ -244,10 +291,31 @@ do { \
8474  } while (0)
8475  #define preempt_fold_need_resched() \
8476  do { \
8477 -       if (tif_need_resched()) \
8478 +       if (tif_need_resched_now()) \
8479                 set_preempt_need_resched(); \
8480  } while (0)
8481  
8482 +#ifdef CONFIG_PREEMPT_RT_FULL
8483 +# define preempt_disable_rt()          preempt_disable()
8484 +# define preempt_enable_rt()           preempt_enable()
8485 +# define preempt_disable_nort()                barrier()
8486 +# define preempt_enable_nort()         barrier()
8487 +# ifdef CONFIG_SMP
8488 +   extern void migrate_disable(void);
8489 +   extern void migrate_enable(void);
8490 +# else /* CONFIG_SMP */
8491 +#  define migrate_disable()            barrier()
8492 +#  define migrate_enable()             barrier()
8493 +# endif /* CONFIG_SMP */
8494 +#else
8495 +# define preempt_disable_rt()          barrier()
8496 +# define preempt_enable_rt()           barrier()
8497 +# define preempt_disable_nort()                preempt_disable()
8498 +# define preempt_enable_nort()         preempt_enable()
8499 +# define migrate_disable()             preempt_disable()
8500 +# define migrate_enable()              preempt_enable()
8501 +#endif
8502 +
8503  #ifdef CONFIG_PREEMPT_NOTIFIERS
8504  
8505  struct preempt_notifier;
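
[Editor's note] The *_nort/*_rt helpers added above let shared code pick the right primitive per configuration: preempt_disable_nort() is a real preempt_disable() on mainline but a plain barrier() on RT, while migrate_disable()/migrate_enable() pin the task to its CPU without turning the section non-preemptible. A hedged sketch (demo_stats and demo_account are hypothetical):

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, demo_stats);

static void demo_account(unsigned long delta)
{
        unsigned long *p;

        migrate_disable();              /* stay on this CPU ... */
        p = this_cpu_ptr(&demo_stats);  /* ... so the pointer stays valid */
        *p += delta;                    /* same-CPU racers, if any, still need
                                           their own protection (e.g. a local lock) */
        migrate_enable();               /* the whole section stays preemptible on RT */
}
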
8506 diff --git a/include/linux/printk.h b/include/linux/printk.h
8507 index eac1af8502bb..37e647af0b0b 100644
8508 --- a/include/linux/printk.h
8509 +++ b/include/linux/printk.h
8510 @@ -126,9 +126,11 @@ struct va_format {
8511  #ifdef CONFIG_EARLY_PRINTK
8512  extern asmlinkage __printf(1, 2)
8513  void early_printk(const char *fmt, ...);
8514 +extern void printk_kill(void);
8515  #else
8516  static inline __printf(1, 2) __cold
8517  void early_printk(const char *s, ...) { }
8518 +static inline void printk_kill(void) { }
8519  #endif
8520  
8521  #ifdef CONFIG_PRINTK_NMI
8522 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
8523 index af3581b8a451..277295039c8f 100644
8524 --- a/include/linux/radix-tree.h
8525 +++ b/include/linux/radix-tree.h
8526 @@ -292,6 +292,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
8527  int radix_tree_preload(gfp_t gfp_mask);
8528  int radix_tree_maybe_preload(gfp_t gfp_mask);
8529  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8530 +void radix_tree_preload_end(void);
8531 +
8532  void radix_tree_init(void);
8533  void *radix_tree_tag_set(struct radix_tree_root *root,
8534                         unsigned long index, unsigned int tag);
8535 @@ -314,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
8536  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
8537  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8538  
8539 -static inline void radix_tree_preload_end(void)
8540 -{
8541 -       preempt_enable();
8542 -}
8543 -
8544  /**
8545   * struct radix_tree_iter - radix tree iterator state
8546   *
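
[Editor's note] radix_tree_preload_end() stops being a static inline wrapper around preempt_enable() and becomes an out-of-line function, presumably so the RT build can pair it with whatever protection radix_tree_preload() took (a local lock rather than bare preemption disabling). The classic preload pattern is unchanged for callers; a sketch with a hypothetical tree and lock:

#include <linux/gfp.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(demo_tree, GFP_KERNEL);       /* hypothetical tree */
static DEFINE_SPINLOCK(demo_lock);              /* hypothetical lock */

static int demo_insert(unsigned long index, void *item)
{
        int err = radix_tree_preload(GFP_KERNEL); /* preallocate nodes, may sleep */

        if (err)
                return err;
        spin_lock(&demo_lock);
        err = radix_tree_insert(&demo_tree, index, item);
        spin_unlock(&demo_lock);
        radix_tree_preload_end();                 /* drop the per-CPU preload protection */
        return err;
}
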
8547 diff --git a/include/linux/random.h b/include/linux/random.h
8548 index 16ab429735a7..9d0fecb5b6c2 100644
8549 --- a/include/linux/random.h
8550 +++ b/include/linux/random.h
8551 @@ -31,7 +31,7 @@ static inline void add_latent_entropy(void) {}
8552  
8553  extern void add_input_randomness(unsigned int type, unsigned int code,
8554                                  unsigned int value) __latent_entropy;
8555 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8556 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8557  
8558  extern void get_random_bytes(void *buf, int nbytes);
8559  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8560 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
8561 index e585018498d5..25c64474fc27 100644
8562 --- a/include/linux/rbtree.h
8563 +++ b/include/linux/rbtree.h
8564 @@ -31,7 +31,7 @@
8565  
8566  #include <linux/kernel.h>
8567  #include <linux/stddef.h>
8568 -#include <linux/rcupdate.h>
8569 +#include <linux/rcu_assign_pointer.h>
8570  
8571  struct rb_node {
8572         unsigned long  __rb_parent_color;
8573 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
8574 index d076183e49be..36bfb4dd57ae 100644
8575 --- a/include/linux/rbtree_augmented.h
8576 +++ b/include/linux/rbtree_augmented.h
8577 @@ -26,6 +26,7 @@
8578  
8579  #include <linux/compiler.h>
8580  #include <linux/rbtree.h>
8581 +#include <linux/rcupdate.h>
8582  
8583  /*
8584   * Please note - only struct rb_augment_callbacks and the prototypes for
8585 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
8586 new file mode 100644
8587 index 000000000000..7066962a4379
8588 --- /dev/null
8589 +++ b/include/linux/rcu_assign_pointer.h
8590 @@ -0,0 +1,54 @@
8591 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8592 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8593 +#include <linux/compiler.h>
8594 +#include <asm/barrier.h>
8595 +
8596 +/**
8597 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8598 + * @v: The value to statically initialize with.
8599 + */
8600 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8601 +
8602 +/**
8603 + * rcu_assign_pointer() - assign to RCU-protected pointer
8604 + * @p: pointer to assign to
8605 + * @v: value to assign (publish)
8606 + *
8607 + * Assigns the specified value to the specified RCU-protected
8608 + * pointer, ensuring that any concurrent RCU readers will see
8609 + * any prior initialization.
8610 + *
8611 + * Inserts memory barriers on architectures that require them
8612 + * (which is most of them), and also prevents the compiler from
8613 + * reordering the code that initializes the structure after the pointer
8614 + * assignment.  More importantly, this call documents which pointers
8615 + * will be dereferenced by RCU read-side code.
8616 + *
8617 + * In some special cases, you may use RCU_INIT_POINTER() instead
8618 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8619 + * to the fact that it does not constrain either the CPU or the compiler.
8620 + * That said, using RCU_INIT_POINTER() when you should have used
8621 + * rcu_assign_pointer() is a very bad thing that results in
8622 + * impossible-to-diagnose memory corruption.  So please be careful.
8623 + * See the RCU_INIT_POINTER() comment header for details.
8624 + *
8625 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8626 + * once, appearances notwithstanding.  One of the "extra" evaluations
8627 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8628 + * neither of which actually execute the argument.  As with most cpp
8629 + * macros, this execute-arguments-only-once property is important, so
8630 + * please be careful when making changes to rcu_assign_pointer() and the
8631 + * other macros that it invokes.
8632 + */
8633 +#define rcu_assign_pointer(p, v)                                             \
8634 +({                                                                           \
8635 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8636 +                                                                             \
8637 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8638 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8639 +       else                                                                  \
8640 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8641 +       _r_a_p__v;                                                            \
8642 +})
8643 +
8644 +#endif
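
[Editor's note] Splitting rcu_assign_pointer() into its own header lets rbtree.h (see its hunk above) stop pulling in all of rcupdate.h; the macro itself is moved verbatim. For reference, the usual publish/read pairing looks like the sketch below; demo_cfg and its field are hypothetical:

#include <linux/rcupdate.h>

struct demo_cfg {
        int threshold;
};

static struct demo_cfg __rcu *demo_cfg_ptr;      /* hypothetical global */

static void demo_publish(struct demo_cfg *newc)
{
        newc->threshold = 42;
        rcu_assign_pointer(demo_cfg_ptr, newc);  /* orders the init before the publish */
}

static int demo_read_threshold(void)
{
        struct demo_cfg *c;
        int val = 0;

        rcu_read_lock();
        c = rcu_dereference(demo_cfg_ptr);       /* paired read on the reader side */
        if (c)
                val = c->threshold;
        rcu_read_unlock();
        return val;
}
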
8645 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
8646 index 01f71e1d2e94..30cc001d0d5a 100644
8647 --- a/include/linux/rcupdate.h
8648 +++ b/include/linux/rcupdate.h
8649 @@ -46,6 +46,7 @@
8650  #include <linux/compiler.h>
8651  #include <linux/ktime.h>
8652  #include <linux/irqflags.h>
8653 +#include <linux/rcu_assign_pointer.h>
8654  
8655  #include <asm/barrier.h>
8656  
8657 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
8658  
8659  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8660  
8661 +#ifdef CONFIG_PREEMPT_RT_FULL
8662 +#define call_rcu_bh    call_rcu
8663 +#else
8664  /**
8665   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8666   * @head: structure to be used for queueing the RCU updates.
8667 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
8668   */
8669  void call_rcu_bh(struct rcu_head *head,
8670                  rcu_callback_t func);
8671 +#endif
8672  
8673  /**
8674   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8675 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
8676   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8677   */
8678  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8679 +#ifndef CONFIG_PREEMPT_RT_FULL
8680 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8681 +#else
8682 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8683 +#endif
8684  
8685  #else /* #ifdef CONFIG_PREEMPT_RCU */
8686  
8687 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
8688         return 0;
8689  }
8690  
8691 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8692 +
8693  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8694  
8695  /* Internal to kernel */
8696 @@ -505,7 +517,14 @@ extern struct lockdep_map rcu_callback_map;
8697  int debug_lockdep_rcu_enabled(void);
8698  
8699  int rcu_read_lock_held(void);
8700 +#ifdef CONFIG_PREEMPT_RT_FULL
8701 +static inline int rcu_read_lock_bh_held(void)
8702 +{
8703 +       return rcu_read_lock_held();
8704 +}
8705 +#else
8706  int rcu_read_lock_bh_held(void);
8707 +#endif
8708  
8709  /**
8710   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8711 @@ -626,54 +645,6 @@ static inline void rcu_preempt_sleep_check(void)
8712  })
8713  
8714  /**
8715 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8716 - * @v: The value to statically initialize with.
8717 - */
8718 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8719 -
8720 -/**
8721 - * rcu_assign_pointer() - assign to RCU-protected pointer
8722 - * @p: pointer to assign to
8723 - * @v: value to assign (publish)
8724 - *
8725 - * Assigns the specified value to the specified RCU-protected
8726 - * pointer, ensuring that any concurrent RCU readers will see
8727 - * any prior initialization.
8728 - *
8729 - * Inserts memory barriers on architectures that require them
8730 - * (which is most of them), and also prevents the compiler from
8731 - * reordering the code that initializes the structure after the pointer
8732 - * assignment.  More importantly, this call documents which pointers
8733 - * will be dereferenced by RCU read-side code.
8734 - *
8735 - * In some special cases, you may use RCU_INIT_POINTER() instead
8736 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8737 - * to the fact that it does not constrain either the CPU or the compiler.
8738 - * That said, using RCU_INIT_POINTER() when you should have used
8739 - * rcu_assign_pointer() is a very bad thing that results in
8740 - * impossible-to-diagnose memory corruption.  So please be careful.
8741 - * See the RCU_INIT_POINTER() comment header for details.
8742 - *
8743 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8744 - * once, appearances notwithstanding.  One of the "extra" evaluations
8745 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8746 - * neither of which actually execute the argument.  As with most cpp
8747 - * macros, this execute-arguments-only-once property is important, so
8748 - * please be careful when making changes to rcu_assign_pointer() and the
8749 - * other macros that it invokes.
8750 - */
8751 -#define rcu_assign_pointer(p, v)                                             \
8752 -({                                                                           \
8753 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8754 -                                                                             \
8755 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8756 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8757 -       else                                                                  \
8758 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8759 -       _r_a_p__v;                                                            \
8760 -})
8761 -
8762 -/**
8763   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8764   * @p: The pointer to read
8765   *
8766 @@ -951,10 +922,14 @@ static inline void rcu_read_unlock(void)
8767  static inline void rcu_read_lock_bh(void)
8768  {
8769         local_bh_disable();
8770 +#ifdef CONFIG_PREEMPT_RT_FULL
8771 +       rcu_read_lock();
8772 +#else
8773         __acquire(RCU_BH);
8774         rcu_lock_acquire(&rcu_bh_lock_map);
8775         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8776                          "rcu_read_lock_bh() used illegally while idle");
8777 +#endif
8778  }
8779  
8780  /*
8781 @@ -964,10 +939,14 @@ static inline void rcu_read_lock_bh(void)
8782   */
8783  static inline void rcu_read_unlock_bh(void)
8784  {
8785 +#ifdef CONFIG_PREEMPT_RT_FULL
8786 +       rcu_read_unlock();
8787 +#else
8788         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8789                          "rcu_read_unlock_bh() used illegally while idle");
8790         rcu_lock_release(&rcu_bh_lock_map);
8791         __release(RCU_BH);
8792 +#endif
8793         local_bh_enable();
8794  }
8795  
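
[Editor's note] With PREEMPT_RT_FULL the BH flavour of RCU collapses onto the normal one: call_rcu_bh becomes call_rcu, rcu_read_lock_bh() takes a regular rcu_read_lock() under local_bh_disable(), and rcu_read_lock_bh_held() is answered by rcu_read_lock_held(). Callers keep the BH API and get correct behaviour on both kernels; a hedged sketch with a hypothetical RCU-protected list:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct demo_entry {
        struct list_head node;
        int val;
};

static LIST_HEAD(demo_list);      /* hypothetical list, updated elsewhere */

static int demo_sum(void)
{
        struct demo_entry *e;
        int sum = 0;

        rcu_read_lock_bh();       /* on RT: rcu_read_lock() + local_bh_disable() */
        list_for_each_entry_rcu(e, &demo_list, node)
                sum += e->val;
        rcu_read_unlock_bh();
        return sum;
}
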
8796 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
8797 index 63a4e4cf40a5..08ab12df2863 100644
8798 --- a/include/linux/rcutree.h
8799 +++ b/include/linux/rcutree.h
8800 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
8801         rcu_note_context_switch();
8802  }
8803  
8804 +#ifdef CONFIG_PREEMPT_RT_FULL
8805 +# define synchronize_rcu_bh    synchronize_rcu
8806 +#else
8807  void synchronize_rcu_bh(void);
8808 +#endif
8809  void synchronize_sched_expedited(void);
8810  void synchronize_rcu_expedited(void);
8811  
8812 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
8813  }
8814  
8815  void rcu_barrier(void);
8816 +#ifdef CONFIG_PREEMPT_RT_FULL
8817 +# define rcu_barrier_bh                rcu_barrier
8818 +#else
8819  void rcu_barrier_bh(void);
8820 +#endif
8821  void rcu_barrier_sched(void);
8822  unsigned long get_state_synchronize_rcu(void);
8823  void cond_synchronize_rcu(unsigned long oldstate);
8824 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
8825  extern unsigned long rcutorture_testseq;
8826  extern unsigned long rcutorture_vernum;
8827  unsigned long rcu_batches_started(void);
8828 -unsigned long rcu_batches_started_bh(void);
8829  unsigned long rcu_batches_started_sched(void);
8830  unsigned long rcu_batches_completed(void);
8831 -unsigned long rcu_batches_completed_bh(void);
8832  unsigned long rcu_batches_completed_sched(void);
8833  unsigned long rcu_exp_batches_completed(void);
8834  unsigned long rcu_exp_batches_completed_sched(void);
8835  void show_rcu_gp_kthreads(void);
8836  
8837  void rcu_force_quiescent_state(void);
8838 -void rcu_bh_force_quiescent_state(void);
8839  void rcu_sched_force_quiescent_state(void);
8840  
8841  void rcu_idle_enter(void);
8842 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
8843  
8844  bool rcu_is_watching(void);
8845  
8846 +#ifndef CONFIG_PREEMPT_RT_FULL
8847 +void rcu_bh_force_quiescent_state(void);
8848 +unsigned long rcu_batches_started_bh(void);
8849 +unsigned long rcu_batches_completed_bh(void);
8850 +#else
8851 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
8852 +# define rcu_batches_completed_bh      rcu_batches_completed
8853 +# define rcu_batches_started_bh                rcu_batches_completed
8854 +#endif
8855 +
8856  void rcu_all_qs(void);
8857  
8858  /* RCUtree hotplug events */
8859 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
8860 index 1abba5ce2a2f..294a8b4875f1 100644
8861 --- a/include/linux/rtmutex.h
8862 +++ b/include/linux/rtmutex.h
8863 @@ -13,11 +13,15 @@
8864  #define __LINUX_RT_MUTEX_H
8865  
8866  #include <linux/linkage.h>
8867 +#include <linux/spinlock_types_raw.h>
8868  #include <linux/rbtree.h>
8869 -#include <linux/spinlock_types.h>
8870  
8871  extern int max_lock_depth; /* for sysctl */
8872  
8873 +#ifdef CONFIG_DEBUG_MUTEXES
8874 +#include <linux/debug_locks.h>
8875 +#endif
8876 +
8877  /**
8878   * The rt_mutex structure
8879   *
8880 @@ -31,8 +35,8 @@ struct rt_mutex {
8881         struct rb_root          waiters;
8882         struct rb_node          *waiters_leftmost;
8883         struct task_struct      *owner;
8884 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8885         int                     save_state;
8886 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8887         const char              *name, *file;
8888         int                     line;
8889         void                    *magic;
8890 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
8891  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
8892  #endif
8893  
8894 +# define rt_mutex_init(mutex)                                  \
8895 +       do {                                                    \
8896 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
8897 +               __rt_mutex_init(mutex, #mutex);                 \
8898 +       } while (0)
8899 +
8900  #ifdef CONFIG_DEBUG_RT_MUTEXES
8901  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8902         , .name = #mutexname, .file = __FILE__, .line = __LINE__
8903 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
8904   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8905  #else
8906  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8907 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
8908  # define rt_mutex_debug_task_free(t)                   do { } while (0)
8909  #endif
8910  
8911 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8912 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8913 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8914 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8915         , .waiters = RB_ROOT \
8916         , .owner = NULL \
8917 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8918 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8919 +
8920 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8921 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8922 +
8923 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8924 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
8925 +       , .save_state = 1 }
8926  
8927  #define DEFINE_RT_MUTEX(mutexname) \
8928         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8929 @@ -90,7 +105,9 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
8930  extern void rt_mutex_destroy(struct rt_mutex *lock);
8931  
8932  extern void rt_mutex_lock(struct rt_mutex *lock);
8933 +extern int rt_mutex_lock_state(struct rt_mutex *lock, int state);
8934  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8935 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8936  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8937                                struct hrtimer_sleeper *timeout);
8938  
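[Editor's note] In rtmutex.h the debug and non-debug variants of rt_mutex_init() collapse into a single macro that always initializes the wait_lock, save_state moves out of the debug-only section (the SAVE_STATE initializer is used by the RT spinlock/rwlock types below), and rt_mutex_lock_state()/rt_mutex_lock_killable() join the public prototypes. A minimal usage sketch; demo_m and demo_do_work are hypothetical, rt_mutex_lock_killable() is the prototype added by this patch:

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(demo_m);                  /* hypothetical lock */

static int demo_do_work(void)
{
        if (rt_mutex_lock_killable(&demo_m))     /* interrupted only by fatal signals */
                return -EINTR;
        /* priority-inheriting critical section */
        rt_mutex_unlock(&demo_m);
        return 0;
}
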
8939 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
8940 new file mode 100644
8941 index 000000000000..49ed2d45d3be
8942 --- /dev/null
8943 +++ b/include/linux/rwlock_rt.h
8944 @@ -0,0 +1,99 @@
8945 +#ifndef __LINUX_RWLOCK_RT_H
8946 +#define __LINUX_RWLOCK_RT_H
8947 +
8948 +#ifndef __LINUX_SPINLOCK_H
8949 +#error Do not include directly. Use spinlock.h
8950 +#endif
8951 +
8952 +#define rwlock_init(rwl)                               \
8953 +do {                                                   \
8954 +       static struct lock_class_key __key;             \
8955 +                                                       \
8956 +       rt_mutex_init(&(rwl)->lock);                    \
8957 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
8958 +} while (0)
8959 +
8960 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8961 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8962 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8963 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8964 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8965 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8966 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8967 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8968 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8969 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8970 +
8971 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
8972 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
8973 +
8974 +#define write_trylock_irqsave(lock, flags)     \
8975 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8976 +
8977 +#define read_lock_irqsave(lock, flags)                 \
8978 +       do {                                            \
8979 +               typecheck(unsigned long, flags);        \
8980 +               flags = rt_read_lock_irqsave(lock);     \
8981 +       } while (0)
8982 +
8983 +#define write_lock_irqsave(lock, flags)                        \
8984 +       do {                                            \
8985 +               typecheck(unsigned long, flags);        \
8986 +               flags = rt_write_lock_irqsave(lock);    \
8987 +       } while (0)
8988 +
8989 +#define read_lock(lock)                rt_read_lock(lock)
8990 +
8991 +#define read_lock_bh(lock)                             \
8992 +       do {                                            \
8993 +               local_bh_disable();                     \
8994 +               rt_read_lock(lock);                     \
8995 +       } while (0)
8996 +
8997 +#define read_lock_irq(lock)    read_lock(lock)
8998 +
8999 +#define write_lock(lock)       rt_write_lock(lock)
9000 +
9001 +#define write_lock_bh(lock)                            \
9002 +       do {                                            \
9003 +               local_bh_disable();                     \
9004 +               rt_write_lock(lock);                    \
9005 +       } while (0)
9006 +
9007 +#define write_lock_irq(lock)   write_lock(lock)
9008 +
9009 +#define read_unlock(lock)      rt_read_unlock(lock)
9010 +
9011 +#define read_unlock_bh(lock)                           \
9012 +       do {                                            \
9013 +               rt_read_unlock(lock);                   \
9014 +               local_bh_enable();                      \
9015 +       } while (0)
9016 +
9017 +#define read_unlock_irq(lock)  read_unlock(lock)
9018 +
9019 +#define write_unlock(lock)     rt_write_unlock(lock)
9020 +
9021 +#define write_unlock_bh(lock)                          \
9022 +       do {                                            \
9023 +               rt_write_unlock(lock);                  \
9024 +               local_bh_enable();                      \
9025 +       } while (0)
9026 +
9027 +#define write_unlock_irq(lock) write_unlock(lock)
9028 +
9029 +#define read_unlock_irqrestore(lock, flags)            \
9030 +       do {                                            \
9031 +               typecheck(unsigned long, flags);        \
9032 +               (void) flags;                           \
9033 +               rt_read_unlock(lock);                   \
9034 +       } while (0)
9035 +
9036 +#define write_unlock_irqrestore(lock, flags) \
9037 +       do {                                            \
9038 +               typecheck(unsigned long, flags);        \
9039 +               (void) flags;                           \
9040 +               rt_write_unlock(lock);                  \
9041 +       } while (0)
9042 +
9043 +#endif
9044 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
9045 index cc0072e93e36..5317cd957292 100644
9046 --- a/include/linux/rwlock_types.h
9047 +++ b/include/linux/rwlock_types.h
9048 @@ -1,6 +1,10 @@
9049  #ifndef __LINUX_RWLOCK_TYPES_H
9050  #define __LINUX_RWLOCK_TYPES_H
9051  
9052 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
9053 +# error "Do not include directly, include spinlock_types.h"
9054 +#endif
9055 +
9056  /*
9057   * include/linux/rwlock_types.h - generic rwlock type definitions
9058   *                               and initializers
9059 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
9060 new file mode 100644
9061 index 000000000000..51b28d775fe1
9062 --- /dev/null
9063 +++ b/include/linux/rwlock_types_rt.h
9064 @@ -0,0 +1,33 @@
9065 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
9066 +#define __LINUX_RWLOCK_TYPES_RT_H
9067 +
9068 +#ifndef __LINUX_SPINLOCK_TYPES_H
9069 +#error "Do not include directly. Include spinlock_types.h instead"
9070 +#endif
9071 +
9072 +/*
9073 + * rwlocks - rtmutex which allows single reader recursion
9074 + */
9075 +typedef struct {
9076 +       struct rt_mutex         lock;
9077 +       int                     read_depth;
9078 +       unsigned int            break_lock;
9079 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9080 +       struct lockdep_map      dep_map;
9081 +#endif
9082 +} rwlock_t;
9083 +
9084 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9085 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
9086 +#else
9087 +# define RW_DEP_MAP_INIT(lockname)
9088 +#endif
9089 +
9090 +#define __RW_LOCK_UNLOCKED(name) \
9091 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
9092 +         RW_DEP_MAP_INIT(name) }
9093 +
9094 +#define DEFINE_RWLOCK(name) \
9095 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
9096 +
9097 +#endif
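
[Editor's note] On RT an rwlock_t becomes an rt_mutex plus a read_depth that allows the single current owner to re-take the read side, and the _irqsave variants keep their prototypes but only typecheck the flags, since interrupts are never actually disabled around a sleeping lock. Call sites therefore compile unchanged; a sketch with a hypothetical lock and value:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(demo_rwlock);               /* hypothetical lock */
static int demo_value;

static int demo_get(void)
{
        unsigned long flags;
        int v;

        read_lock_irqsave(&demo_rwlock, flags);  /* rt_read_lock(); flags only typechecked on RT */
        v = demo_value;
        read_unlock_irqrestore(&demo_rwlock, flags);
        return v;
}

static void demo_set(int v)
{
        write_lock(&demo_rwlock);                /* rt_write_lock(), PI-aware and sleepable */
        demo_value = v;
        write_unlock(&demo_rwlock);
}
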
9098 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
9099 index dd1d14250340..aa2ac1f65c2d 100644
9100 --- a/include/linux/rwsem.h
9101 +++ b/include/linux/rwsem.h
9102 @@ -19,6 +19,10 @@
9103  #include <linux/osq_lock.h>
9104  #endif
9105  
9106 +#ifdef CONFIG_PREEMPT_RT_FULL
9107 +#include <linux/rwsem_rt.h>
9108 +#else /* PREEMPT_RT_FULL */
9109 +
9110  struct rw_semaphore;
9111  
9112  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
9113 @@ -106,6 +110,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
9114         return !list_empty(&sem->wait_list);
9115  }
9116  
9117 +#endif /* !PREEMPT_RT_FULL */
9118 +
9119 +/*
9120 + * The functions below are the same for all rwsem implementations including
9121 + * the RT specific variant.
9122 + */
9123 +
9124  /*
9125   * lock for reading
9126   */
9127 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
9128 new file mode 100644
9129 index 000000000000..2ffbf093ae92
9130 --- /dev/null
9131 +++ b/include/linux/rwsem_rt.h
9132 @@ -0,0 +1,67 @@
9133 +#ifndef _LINUX_RWSEM_RT_H
9134 +#define _LINUX_RWSEM_RT_H
9135 +
9136 +#ifndef _LINUX_RWSEM_H
9137 +#error "Include rwsem.h"
9138 +#endif
9139 +
9140 +#include <linux/rtmutex.h>
9141 +#include <linux/swait.h>
9142 +
9143 +#define READER_BIAS            (1U << 31)
9144 +#define WRITER_BIAS            (1U << 30)
9145 +
9146 +struct rw_semaphore {
9147 +       atomic_t                readers;
9148 +       struct rt_mutex         rtmutex;
9149 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9150 +       struct lockdep_map      dep_map;
9151 +#endif
9152 +};
9153 +
9154 +#define __RWSEM_INITIALIZER(name)                              \
9155 +{                                                              \
9156 +       .readers = ATOMIC_INIT(READER_BIAS),                    \
9157 +       .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex),        \
9158 +       RW_DEP_MAP_INIT(name)                                   \
9159 +}
9160 +
9161 +#define DECLARE_RWSEM(lockname) \
9162 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
9163 +
9164 +extern void  __rwsem_init(struct rw_semaphore *rwsem, const char *name,
9165 +                         struct lock_class_key *key);
9166 +
9167 +#define __init_rwsem(sem, name, key)                   \
9168 +do {                                                   \
9169 +               rt_mutex_init(&(sem)->rtmutex);         \
9170 +               __rwsem_init((sem), (name), (key));     \
9171 +} while (0)
9172 +
9173 +#define init_rwsem(sem)                                        \
9174 +do {                                                   \
9175 +       static struct lock_class_key __key;             \
9176 +                                                       \
9177 +       __init_rwsem((sem), #sem, &__key);              \
9178 +} while (0)
9179 +
9180 +static inline int rwsem_is_locked(struct rw_semaphore *sem)
9181 +{
9182 +       return atomic_read(&sem->readers) != READER_BIAS;
9183 +}
9184 +
9185 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
9186 +{
9187 +       return atomic_read(&sem->readers) > 0;
9188 +}
9189 +
9190 +extern void __down_read(struct rw_semaphore *sem);
9191 +extern int __down_read_trylock(struct rw_semaphore *sem);
9192 +extern void __down_write(struct rw_semaphore *sem);
9193 +extern int __must_check __down_write_killable(struct rw_semaphore *sem);
9194 +extern int __down_write_trylock(struct rw_semaphore *sem);
9195 +extern void __up_read(struct rw_semaphore *sem);
9196 +extern void __up_write(struct rw_semaphore *sem);
9197 +extern void __downgrade_write(struct rw_semaphore *sem);
9198 +
9199 +#endif
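
[Editor's note] The RT rw_semaphore pairs an atomic 'readers' counter with an embedded rtmutex: rwsem_is_locked() reports locked whenever the counter differs from READER_BIAS and rwsem_is_contended() whenever it is positive, while the slow paths (__down_read(), __down_write(), ...) live out of line. The caller-visible API stays the usual one; a hedged sketch with a hypothetical semaphore:

#include <linux/rwsem.h>

static DECLARE_RWSEM(demo_sem);                  /* hypothetical semaphore */

static void demo_reader(void)
{
        down_read(&demo_sem);     /* counter fast path, rtmutex only on contention */
        /* shared section, fully preemptible */
        up_read(&demo_sem);
}

static void demo_writer(void)
{
        down_write(&demo_sem);    /* takes the embedded rtmutex, gets PI boosting */
        /* exclusive section */
        up_write(&demo_sem);
}
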
9200 diff --git a/include/linux/sched.h b/include/linux/sched.h
9201 index f425eb3318ab..e010fb4d640d 100644
9202 --- a/include/linux/sched.h
9203 +++ b/include/linux/sched.h
9204 @@ -26,6 +26,7 @@ struct sched_param {
9205  #include <linux/nodemask.h>
9206  #include <linux/mm_types.h>
9207  #include <linux/preempt.h>
9208 +#include <asm/kmap_types.h>
9209  
9210  #include <asm/page.h>
9211  #include <asm/ptrace.h>
9212 @@ -243,10 +244,7 @@ extern char ___assert_task_state[1 - 2*!!(
9213                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
9214                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
9215  
9216 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
9217  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
9218 -#define task_is_stopped_or_traced(task)        \
9219 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
9220  #define task_contributes_to_load(task) \
9221                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
9222                                  (task->flags & PF_FROZEN) == 0 && \
9223 @@ -312,6 +310,11 @@ extern char ___assert_task_state[1 - 2*!!(
9224  
9225  #endif
9226  
9227 +#define __set_current_state_no_track(state_value)      \
9228 +       do { current->state = (state_value); } while (0)
9229 +#define set_current_state_no_track(state_value)                \
9230 +       set_mb(current->state, (state_value))
9231 +
9232  /* Task command name length */
9233  #define TASK_COMM_LEN 16
9234  
9235 @@ -1013,8 +1016,18 @@ struct wake_q_head {
9236         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
9237  
9238  extern void wake_q_add(struct wake_q_head *head,
9239 -                      struct task_struct *task);
9240 -extern void wake_up_q(struct wake_q_head *head);
9241 +                             struct task_struct *task);
9242 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
9243 +
9244 +static inline void wake_up_q(struct wake_q_head *head)
9245 +{
9246 +       __wake_up_q(head, false);
9247 +}
9248 +
9249 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
9250 +{
9251 +       __wake_up_q(head, true);
9252 +}
9253  
9254  /*
9255   * sched-domains (multiprocessor balancing) declarations:
9256 @@ -1481,6 +1494,7 @@ struct task_struct {
9257         struct thread_info thread_info;
9258  #endif
9259         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9260 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
9261         void *stack;
9262         atomic_t usage;
9263         unsigned int flags;     /* per process flags, defined below */
9264 @@ -1520,6 +1534,12 @@ struct task_struct {
9265  #endif
9266  
9267         unsigned int policy;
9268 +#ifdef CONFIG_PREEMPT_RT_FULL
9269 +       int migrate_disable;
9270 +# ifdef CONFIG_SCHED_DEBUG
9271 +       int migrate_disable_atomic;
9272 +# endif
9273 +#endif
9274         int nr_cpus_allowed;
9275         cpumask_t cpus_allowed;
9276  
9277 @@ -1658,6 +1678,9 @@ struct task_struct {
9278  
9279         struct task_cputime cputime_expires;
9280         struct list_head cpu_timers[3];
9281 +#ifdef CONFIG_PREEMPT_RT_BASE
9282 +       struct task_struct *posix_timer_list;
9283 +#endif
9284  
9285  /* process credentials */
9286         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
9287 @@ -1689,10 +1712,15 @@ struct task_struct {
9288  /* signal handlers */
9289         struct signal_struct *signal;
9290         struct sighand_struct *sighand;
9291 +       struct sigqueue *sigqueue_cache;
9292  
9293         sigset_t blocked, real_blocked;
9294         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9295         struct sigpending pending;
9296 +#ifdef CONFIG_PREEMPT_RT_FULL
9297 +       /* TODO: move me into ->restart_block ? */
9298 +       struct siginfo forced_info;
9299 +#endif
9300  
9301         unsigned long sas_ss_sp;
9302         size_t sas_ss_size;
9303 @@ -1723,6 +1751,8 @@ struct task_struct {
9304         /* PI waiters blocked on a rt_mutex held by this task */
9305         struct rb_root pi_waiters;
9306         struct rb_node *pi_waiters_leftmost;
9307 +       /* Updated under owner's pi_lock and rq lock */
9308 +       struct task_struct      *pi_top_task;
9309         /* Deadlock detection and priority inheritance handling */
9310         struct rt_mutex_waiter *pi_blocked_on;
9311  #endif
9312 @@ -1921,6 +1951,12 @@ struct task_struct {
9313         /* bitmask and counter of trace recursion */
9314         unsigned long trace_recursion;
9315  #endif /* CONFIG_TRACING */
9316 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
9317 +       u64 preempt_timestamp_hist;
9318 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
9319 +       long timer_offset;
9320 +#endif
9321 +#endif
9322  #ifdef CONFIG_KCOV
9323         /* Coverage collection mode enabled for this task (0 if disabled). */
9324         enum kcov_mode kcov_mode;
9325 @@ -1946,9 +1982,23 @@ struct task_struct {
9326         unsigned int    sequential_io;
9327         unsigned int    sequential_io_avg;
9328  #endif
9329 +#ifdef CONFIG_PREEMPT_RT_BASE
9330 +       struct rcu_head put_rcu;
9331 +       int softirq_nestcnt;
9332 +       unsigned int softirqs_raised;
9333 +#endif
9334 +#ifdef CONFIG_PREEMPT_RT_FULL
9335 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9336 +       int kmap_idx;
9337 +       pte_t kmap_pte[KM_TYPE_NR];
9338 +# endif
9339 +#endif
9340  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9341         unsigned long   task_state_change;
9342  #endif
9343 +#ifdef CONFIG_PREEMPT_RT_FULL
9344 +       int xmit_recursion;
9345 +#endif
9346         int pagefault_disabled;
9347  #ifdef CONFIG_MMU
9348         struct task_struct *oom_reaper_list;
9349 @@ -1988,14 +2038,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
9350  }
9351  #endif
9352  
9353 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9354 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9355 -
9356 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9357 -{
9358 -       return p->nr_cpus_allowed;
9359 -}
9360 -
9361  #define TNF_MIGRATED   0x01
9362  #define TNF_NO_GROUP   0x02
9363  #define TNF_SHARED     0x04
9364 @@ -2211,6 +2253,15 @@ extern struct pid *cad_pid;
9365  extern void free_task(struct task_struct *tsk);
9366  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9367  
9368 +#ifdef CONFIG_PREEMPT_RT_BASE
9369 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9370 +
9371 +static inline void put_task_struct(struct task_struct *t)
9372 +{
9373 +       if (atomic_dec_and_test(&t->usage))
9374 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9375 +}
9376 +#else
9377  extern void __put_task_struct(struct task_struct *t);
9378  
9379  static inline void put_task_struct(struct task_struct *t)
9380 @@ -2218,6 +2269,7 @@ static inline void put_task_struct(struct task_struct *t)
9381         if (atomic_dec_and_test(&t->usage))
9382                 __put_task_struct(t);
9383  }
9384 +#endif
9385  
9386  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9387  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9388 @@ -2259,6 +2311,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
9389  /*
9390   * Per process flags
9391   */
9392 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9393  #define PF_EXITING     0x00000004      /* getting shut down */
9394  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9395  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9396 @@ -2427,6 +2480,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
9397  
9398  extern int set_cpus_allowed_ptr(struct task_struct *p,
9399                                 const struct cpumask *new_mask);
9400 +int migrate_me(void);
9401 +void tell_sched_cpu_down_begin(int cpu);
9402 +void tell_sched_cpu_down_done(int cpu);
9403 +
9404  #else
9405  static inline void do_set_cpus_allowed(struct task_struct *p,
9406                                       const struct cpumask *new_mask)
9407 @@ -2439,6 +2496,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
9408                 return -EINVAL;
9409         return 0;
9410  }
9411 +static inline int migrate_me(void) { return 0; }
9412 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9413 +static inline void tell_sched_cpu_down_done(int cpu) { }
9414  #endif
9415  
9416  #ifdef CONFIG_NO_HZ_COMMON
9417 @@ -2677,6 +2737,7 @@ extern void xtime_update(unsigned long ticks);
9418  
9419  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9420  extern int wake_up_process(struct task_struct *tsk);
9421 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9422  extern void wake_up_new_task(struct task_struct *tsk);
9423  #ifdef CONFIG_SMP
9424   extern void kick_process(struct task_struct *tsk);
9425 @@ -2885,6 +2946,17 @@ static inline void mmdrop(struct mm_struct *mm)
9426                 __mmdrop(mm);
9427  }
9428  
9429 +#ifdef CONFIG_PREEMPT_RT_BASE
9430 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9431 +static inline void mmdrop_delayed(struct mm_struct *mm)
9432 +{
9433 +       if (atomic_dec_and_test(&mm->mm_count))
9434 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9435 +}
9436 +#else
9437 +# define mmdrop_delayed(mm)    mmdrop(mm)
9438 +#endif
9439 +
9440  static inline void mmdrop_async_fn(struct work_struct *work)
9441  {
9442         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9443 @@ -3277,6 +3349,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
9444         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9445  }
9446  
9447 +#ifdef CONFIG_PREEMPT_LAZY
9448 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9449 +{
9450 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9451 +}
9452 +
9453 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9454 +{
9455 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9456 +}
9457 +
9458 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9459 +{
9460 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9461 +}
9462 +
9463 +static inline int need_resched_lazy(void)
9464 +{
9465 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9466 +}
9467 +
9468 +static inline int need_resched_now(void)
9469 +{
9470 +       return test_thread_flag(TIF_NEED_RESCHED);
9471 +}
9472 +
9473 +#else
9474 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9475 +static inline int need_resched_lazy(void) { return 0; }
9476 +
9477 +static inline int need_resched_now(void)
9478 +{
9479 +       return test_thread_flag(TIF_NEED_RESCHED);
9480 +}
9481 +
9482 +#endif
9483 +
9484  static inline int restart_syscall(void)
9485  {
9486         set_tsk_thread_flag(current, TIF_SIGPENDING);
9487 @@ -3308,6 +3417,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
9488         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9489  }
9490  
9491 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9492 +{
9493 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9494 +               return true;
9495 +#ifdef CONFIG_PREEMPT_RT_FULL
9496 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9497 +               return true;
9498 +#endif
9499 +       return false;
9500 +}
9501 +
9502 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9503 +{
9504 +       bool traced_stopped;
9505 +
9506 +#ifdef CONFIG_PREEMPT_RT_FULL
9507 +       unsigned long flags;
9508 +
9509 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9510 +       traced_stopped = __task_is_stopped_or_traced(task);
9511 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9512 +#else
9513 +       traced_stopped = __task_is_stopped_or_traced(task);
9514 +#endif
9515 +       return traced_stopped;
9516 +}
9517 +
9518 +static inline bool task_is_traced(struct task_struct *task)
9519 +{
9520 +       bool traced = false;
9521 +
9522 +       if (task->state & __TASK_TRACED)
9523 +               return true;
9524 +#ifdef CONFIG_PREEMPT_RT_FULL
9525 +       /* in case the task is sleeping on tasklist_lock */
9526 +       raw_spin_lock_irq(&task->pi_lock);
9527 +       if (task->state & __TASK_TRACED)
9528 +               traced = true;
9529 +       else if (task->saved_state & __TASK_TRACED)
9530 +               traced = true;
9531 +       raw_spin_unlock_irq(&task->pi_lock);
9532 +#endif
9533 +       return traced;
9534 +}
9535 +
9536  /*
9537   * cond_resched() and cond_resched_lock(): latency reduction via
9538   * explicit rescheduling in places that are safe. The return
9539 @@ -3333,12 +3487,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
9540         __cond_resched_lock(lock);                              \
9541  })
9542  
9543 +#ifndef CONFIG_PREEMPT_RT_FULL
9544  extern int __cond_resched_softirq(void);
9545  
9546  #define cond_resched_softirq() ({                                      \
9547         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9548         __cond_resched_softirq();                                       \
9549  })
9550 +#else
9551 +# define cond_resched_softirq()                cond_resched()
9552 +#endif
9553  
9554  static inline void cond_resched_rcu(void)
9555  {
9556 @@ -3513,6 +3671,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
9557  
9558  #endif /* CONFIG_SMP */
9559  
9560 +static inline int __migrate_disabled(struct task_struct *p)
9561 +{
9562 +#ifdef CONFIG_PREEMPT_RT_FULL
9563 +       return p->migrate_disable;
9564 +#else
9565 +       return 0;
9566 +#endif
9567 +}
9568 +
9569 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9570 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9571 +{
9572 +       if (__migrate_disabled(p))
9573 +               return cpumask_of(task_cpu(p));
9574 +
9575 +       return &p->cpus_allowed;
9576 +}
9577 +
9578 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9579 +{
9580 +       if (__migrate_disabled(p))
9581 +               return 1;
9582 +       return p->nr_cpus_allowed;
9583 +}
9584 +
9585  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9586  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9587  
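
[Editor's note] tsk_cpus_allowed() and tsk_nr_cpus_allowed() are no longer plain accessors: while a task is migrate-disabled they report only the CPU the task is currently pinned to, so balancing and push/pull code does not try to move it. A hedged sketch of a consumer-side check; demo_can_push is hypothetical, the two accessors are the ones defined in the hunk above:

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Hypothetical helper: may @p be pushed to @dst_cpu right now? */
static bool demo_can_push(struct task_struct *p, int dst_cpu)
{
        if (tsk_nr_cpus_allowed(p) < 2)          /* reads as 1 while migrate-disabled on RT */
                return false;
        return cpumask_test_cpu(dst_cpu, tsk_cpus_allowed(p));
}
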
9588 diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
9589 index a30b172df6e1..db3e91f2bc03 100644
9590 --- a/include/linux/sched/rt.h
9591 +++ b/include/linux/sched/rt.h
9592 @@ -16,27 +16,20 @@ static inline int rt_task(struct task_struct *p)
9593  }
9594  
9595  #ifdef CONFIG_RT_MUTEXES
9596 -extern int rt_mutex_getprio(struct task_struct *p);
9597 -extern void rt_mutex_setprio(struct task_struct *p, int prio);
9598 -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
9599 -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
9600 +/*
9601 + * Must hold either p->pi_lock or task_rq(p)->lock.
9602 + */
9603 +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
9604 +{
9605 +       return p->pi_top_task;
9606 +}
9607 +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
9608  extern void rt_mutex_adjust_pi(struct task_struct *p);
9609  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
9610  {
9611         return tsk->pi_blocked_on != NULL;
9612  }
9613  #else
9614 -static inline int rt_mutex_getprio(struct task_struct *p)
9615 -{
9616 -       return p->normal_prio;
9617 -}
9618 -
9619 -static inline int rt_mutex_get_effective_prio(struct task_struct *task,
9620 -                                             int newprio)
9621 -{
9622 -       return newprio;
9623 -}
9624 -
9625  static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
9626  {
9627         return NULL;
9628 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
9629 index ead97654c4e9..3d7223ffdd3b 100644
9630 --- a/include/linux/seqlock.h
9631 +++ b/include/linux/seqlock.h
9632 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
9633         return __read_seqcount_retry(s, start);
9634  }
9635  
9636 -
9637 -
9638 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9639 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9640  {
9641         s->sequence++;
9642         smp_wmb();
9643  }
9644  
9645 -static inline void raw_write_seqcount_end(seqcount_t *s)
9646 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9647 +{
9648 +       preempt_disable_rt();
9649 +       __raw_write_seqcount_begin(s);
9650 +}
9651 +
9652 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9653  {
9654         smp_wmb();
9655         s->sequence++;
9656  }
9657  
9658 +static inline void raw_write_seqcount_end(seqcount_t *s)
9659 +{
9660 +       __raw_write_seqcount_end(s);
9661 +       preempt_enable_rt();
9662 +}
9663 +
9664  /**
9665   * raw_write_seqcount_barrier - do a seq write barrier
9666   * @s: pointer to seqcount_t
9667 @@ -428,10 +438,32 @@ typedef struct {
9668  /*
9669   * Read side functions for starting and finalizing a read side section.
9670   */
9671 +#ifndef CONFIG_PREEMPT_RT_FULL
9672  static inline unsigned read_seqbegin(const seqlock_t *sl)
9673  {
9674         return read_seqcount_begin(&sl->seqcount);
9675  }
9676 +#else
9677 +/*
9678 + * Starvation safe read side for RT
9679 + */
9680 +static inline unsigned read_seqbegin(seqlock_t *sl)
9681 +{
9682 +       unsigned ret;
9683 +
9684 +repeat:
9685 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9686 +       if (unlikely(ret & 1)) {
9687 +               /*
9688 +                * Take the lock and let the writer proceed (i.e. possibly
9689 +                * boost it), otherwise we could loop here forever.
9690 +                */
9691 +               spin_unlock_wait(&sl->lock);
9692 +               goto repeat;
9693 +       }
9694 +       return ret;
9695 +}
9696 +#endif
9697  
9698  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9699  {
9700 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9701  static inline void write_seqlock(seqlock_t *sl)
9702  {
9703         spin_lock(&sl->lock);
9704 -       write_seqcount_begin(&sl->seqcount);
9705 +       __raw_write_seqcount_begin(&sl->seqcount);
9706 +}
9707 +
9708 +static inline int try_write_seqlock(seqlock_t *sl)
9709 +{
9710 +       if (spin_trylock(&sl->lock)) {
9711 +               __raw_write_seqcount_begin(&sl->seqcount);
9712 +               return 1;
9713 +       }
9714 +       return 0;
9715  }
9716  
9717  static inline void write_sequnlock(seqlock_t *sl)
9718  {
9719 -       write_seqcount_end(&sl->seqcount);
9720 +       __raw_write_seqcount_end(&sl->seqcount);
9721         spin_unlock(&sl->lock);
9722  }
9723  
9724  static inline void write_seqlock_bh(seqlock_t *sl)
9725  {
9726         spin_lock_bh(&sl->lock);
9727 -       write_seqcount_begin(&sl->seqcount);
9728 +       __raw_write_seqcount_begin(&sl->seqcount);
9729  }
9730  
9731  static inline void write_sequnlock_bh(seqlock_t *sl)
9732  {
9733 -       write_seqcount_end(&sl->seqcount);
9734 +       __raw_write_seqcount_end(&sl->seqcount);
9735         spin_unlock_bh(&sl->lock);
9736  }
9737  
9738  static inline void write_seqlock_irq(seqlock_t *sl)
9739  {
9740         spin_lock_irq(&sl->lock);
9741 -       write_seqcount_begin(&sl->seqcount);
9742 +       __raw_write_seqcount_begin(&sl->seqcount);
9743  }
9744  
9745  static inline void write_sequnlock_irq(seqlock_t *sl)
9746  {
9747 -       write_seqcount_end(&sl->seqcount);
9748 +       __raw_write_seqcount_end(&sl->seqcount);
9749         spin_unlock_irq(&sl->lock);
9750  }
9751  
9752 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9753         unsigned long flags;
9754  
9755         spin_lock_irqsave(&sl->lock, flags);
9756 -       write_seqcount_begin(&sl->seqcount);
9757 +       __raw_write_seqcount_begin(&sl->seqcount);
9758         return flags;
9759  }
9760  
9761 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9762  static inline void
9763  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9764  {
9765 -       write_seqcount_end(&sl->seqcount);
9766 +       __raw_write_seqcount_end(&sl->seqcount);
9767         spin_unlock_irqrestore(&sl->lock, flags);
9768  }
9769  
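
The RT variant of read_seqbegin() above stays starvation-free because a reader that observes an odd sequence blocks on sl->lock (priority-boosting the writer) instead of busy-waiting. A minimal reader loop, sketched for illustration (the sampled value and function name are assumptions, not taken from the diff):

#include <linux/types.h>
#include <linux/seqlock.h>

/* Snapshot a 64-bit value published under @sl; retries while a writer runs. */
static u64 read_sample(seqlock_t *sl, const u64 *value)
{
        unsigned int seq;
        u64 snap;

        do {
                seq = read_seqbegin(sl);        /* on RT: may sleep on sl->lock */
                snap = *value;
        } while (read_seqretry(sl, seq));

        return snap;
}
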
9770 diff --git a/include/linux/signal.h b/include/linux/signal.h
9771 index b63f63eaa39c..295540fdfc72 100644
9772 --- a/include/linux/signal.h
9773 +++ b/include/linux/signal.h
9774 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
9775  }
9776  
9777  extern void flush_sigqueue(struct sigpending *queue);
9778 +extern void flush_task_sigqueue(struct task_struct *tsk);
9779  
9780  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9781  static inline int valid_signal(unsigned long sig)
9782 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
9783 index 32810f279f8e..0db6e31161f6 100644
9784 --- a/include/linux/skbuff.h
9785 +++ b/include/linux/skbuff.h
9786 @@ -284,6 +284,7 @@ struct sk_buff_head {
9787  
9788         __u32           qlen;
9789         spinlock_t      lock;
9790 +       raw_spinlock_t  raw_lock;
9791  };
9792  
9793  struct sk_buff;
9794 @@ -1573,6 +1574,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
9795         __skb_queue_head_init(list);
9796  }
9797  
9798 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9799 +{
9800 +       raw_spin_lock_init(&list->raw_lock);
9801 +       __skb_queue_head_init(list);
9802 +}
9803 +
9804  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9805                 struct lock_class_key *class)
9806  {
9807 diff --git a/include/linux/smp.h b/include/linux/smp.h
9808 index 8e0cb7a0f836..891c533724f5 100644
9809 --- a/include/linux/smp.h
9810 +++ b/include/linux/smp.h
9811 @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
9812  extern void __init setup_nr_cpu_ids(void);
9813  extern void __init smp_init(void);
9814  
9815 +extern int __boot_cpu_id;
9816 +
9817 +static inline int get_boot_cpu_id(void)
9818 +{
9819 +       return __boot_cpu_id;
9820 +}
9821 +
9822  #else /* !SMP */
9823  
9824  static inline void smp_send_stop(void) { }
9825 @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
9826  static inline void smp_init(void) { }
9827  #endif
9828  
9829 +static inline int get_boot_cpu_id(void)
9830 +{
9831 +       return 0;
9832 +}
9833 +
9834  #endif /* !SMP */
9835  
9836  /*
9837 @@ -185,6 +197,9 @@ static inline void smp_init(void) { }
9838  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
9839  #define put_cpu()              preempt_enable()
9840  
9841 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
9842 +#define put_cpu_light()                migrate_enable()
9843 +
9844  /*
9845   * Callback to arch code if there's nosmp or maxcpus=0 on the
9846   * boot command line:
9847 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
9848 index 47dd0cebd204..b241cc044bd3 100644
9849 --- a/include/linux/spinlock.h
9850 +++ b/include/linux/spinlock.h
9851 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9852  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
9853  
9854  /* Include rwlock functions */
9855 -#include <linux/rwlock.h>
9856 +#ifdef CONFIG_PREEMPT_RT_FULL
9857 +# include <linux/rwlock_rt.h>
9858 +#else
9859 +# include <linux/rwlock.h>
9860 +#endif
9861  
9862  /*
9863   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9864 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9865  # include <linux/spinlock_api_up.h>
9866  #endif
9867  
9868 +#ifdef CONFIG_PREEMPT_RT_FULL
9869 +# include <linux/spinlock_rt.h>
9870 +#else /* PREEMPT_RT_FULL */
9871 +
9872  /*
9873   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9874   */
9875 @@ -416,4 +424,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
9876  #define atomic_dec_and_lock(atomic, lock) \
9877                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9878  
9879 +#endif /* !PREEMPT_RT_FULL */
9880 +
9881  #endif /* __LINUX_SPINLOCK_H */
9882 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
9883 index 5344268e6e62..043263f30e81 100644
9884 --- a/include/linux/spinlock_api_smp.h
9885 +++ b/include/linux/spinlock_api_smp.h
9886 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
9887         return 0;
9888  }
9889  
9890 -#include <linux/rwlock_api_smp.h>
9891 +#ifndef CONFIG_PREEMPT_RT_FULL
9892 +# include <linux/rwlock_api_smp.h>
9893 +#endif
9894  
9895  #endif /* __LINUX_SPINLOCK_API_SMP_H */
9896 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
9897 new file mode 100644
9898 index 000000000000..43ca841b913a
9899 --- /dev/null
9900 +++ b/include/linux/spinlock_rt.h
9901 @@ -0,0 +1,162 @@
9902 +#ifndef __LINUX_SPINLOCK_RT_H
9903 +#define __LINUX_SPINLOCK_RT_H
9904 +
9905 +#ifndef __LINUX_SPINLOCK_H
9906 +#error Do not include directly. Use spinlock.h
9907 +#endif
9908 +
9909 +#include <linux/bug.h>
9910 +
9911 +extern void
9912 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9913 +
9914 +#define spin_lock_init(slock)                          \
9915 +do {                                                   \
9916 +       static struct lock_class_key __key;             \
9917 +                                                       \
9918 +       rt_mutex_init(&(slock)->lock);                  \
9919 +       __rt_spin_lock_init(slock, #slock, &__key);     \
9920 +} while (0)
9921 +
9922 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9923 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9924 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9925 +
9926 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9927 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9928 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9929 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9930 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9931 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9932 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9933 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9934 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9935 +
9936 +/*
9937 + * lockdep-less calls, for derived types like rwlock:
9938 + * (for trylock they can use rt_mutex_trylock() directly).
9939 + */
9940 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9941 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9942 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9943 +
9944 +#define spin_lock(lock)                        rt_spin_lock(lock)
9945 +
9946 +#define spin_lock_bh(lock)                     \
9947 +       do {                                    \
9948 +               local_bh_disable();             \
9949 +               rt_spin_lock(lock);             \
9950 +       } while (0)
9951 +
9952 +#define spin_lock_irq(lock)            spin_lock(lock)
9953 +
9954 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
9955 +
9956 +#define spin_trylock(lock)                     \
9957 +({                                             \
9958 +       int __locked;                           \
9959 +       __locked = spin_do_trylock(lock);       \
9960 +       __locked;                               \
9961 +})
9962 +
9963 +#ifdef CONFIG_LOCKDEP
9964 +# define spin_lock_nested(lock, subclass)              \
9965 +       do {                                            \
9966 +               rt_spin_lock_nested(lock, subclass);    \
9967 +       } while (0)
9968 +
9969 +#define spin_lock_bh_nested(lock, subclass)            \
9970 +       do {                                            \
9971 +               local_bh_disable();                     \
9972 +               rt_spin_lock_nested(lock, subclass);    \
9973 +       } while (0)
9974 +
9975 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9976 +       do {                                             \
9977 +               typecheck(unsigned long, flags);         \
9978 +               flags = 0;                               \
9979 +               rt_spin_lock_nested(lock, subclass);     \
9980 +       } while (0)
9981 +#else
9982 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
9983 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
9984 +
9985 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9986 +       do {                                             \
9987 +               typecheck(unsigned long, flags);         \
9988 +               flags = 0;                               \
9989 +               spin_lock(lock);                         \
9990 +       } while (0)
9991 +#endif
9992 +
9993 +#define spin_lock_irqsave(lock, flags)                  \
9994 +       do {                                             \
9995 +               typecheck(unsigned long, flags);         \
9996 +               flags = 0;                               \
9997 +               spin_lock(lock);                         \
9998 +       } while (0)
9999 +
10000 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
10001 +{
10002 +       unsigned long flags = 0;
10003 +#ifdef CONFIG_TRACE_IRQFLAGS
10004 +       flags = rt_spin_lock_trace_flags(lock);
10005 +#else
10006 +       spin_lock(lock); /* lock_local */
10007 +#endif
10008 +       return flags;
10009 +}
10010 +
10011 +/* FIXME: we need rt_spin_lock_nest_lock */
10012 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
10013 +
10014 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
10015 +
10016 +#define spin_unlock_bh(lock)                           \
10017 +       do {                                            \
10018 +               rt_spin_unlock(lock);                   \
10019 +               local_bh_enable();                      \
10020 +       } while (0)
10021 +
10022 +#define spin_unlock_irq(lock)          spin_unlock(lock)
10023 +
10024 +#define spin_unlock_irqrestore(lock, flags)            \
10025 +       do {                                            \
10026 +               typecheck(unsigned long, flags);        \
10027 +               (void) flags;                           \
10028 +               spin_unlock(lock);                      \
10029 +       } while (0)
10030 +
10031 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
10032 +#define spin_trylock_irq(lock) spin_trylock(lock)
10033 +
10034 +#define spin_trylock_irqsave(lock, flags)      \
10035 +       rt_spin_trylock_irqsave(lock, &(flags))
10036 +
10037 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
10038 +
10039 +#ifdef CONFIG_GENERIC_LOCKBREAK
10040 +# define spin_is_contended(lock)       ((lock)->break_lock)
10041 +#else
10042 +# define spin_is_contended(lock)       (((void)(lock), 0))
10043 +#endif
10044 +
10045 +static inline int spin_can_lock(spinlock_t *lock)
10046 +{
10047 +       return !rt_mutex_is_locked(&lock->lock);
10048 +}
10049 +
10050 +static inline int spin_is_locked(spinlock_t *lock)
10051 +{
10052 +       return rt_mutex_is_locked(&lock->lock);
10053 +}
10054 +
10055 +static inline void assert_spin_locked(spinlock_t *lock)
10056 +{
10057 +       BUG_ON(!spin_is_locked(lock));
10058 +}
10059 +
10060 +#define atomic_dec_and_lock(atomic, lock) \
10061 +       atomic_dec_and_spin_lock(atomic, lock)
10062 +
10063 +#endif
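
With the mapping above, a spinlock_t on PREEMPT_RT_FULL is a sleeping rtmutex: spin_lock_irqsave() neither disables interrupts nor preemption and only zeroes the flags word, so spinlock-protected sections may be preempted. A minimal sketch of a caller (the lock, counter and function names are assumed for illustration):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static unsigned long demo_counter;

static void demo_update(void)
{
        unsigned long flags;

        spin_lock_irqsave(&demo_lock, flags);   /* rt_spin_lock() on RT, flags = 0 */
        demo_counter++;                         /* section is preemptible on RT */
        spin_unlock_irqrestore(&demo_lock, flags);
}
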
10064 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
10065 index 73548eb13a5d..10bac715ea96 100644
10066 --- a/include/linux/spinlock_types.h
10067 +++ b/include/linux/spinlock_types.h
10068 @@ -9,80 +9,15 @@
10069   * Released under the General Public License (GPL).
10070   */
10071  
10072 -#if defined(CONFIG_SMP)
10073 -# include <asm/spinlock_types.h>
10074 +#include <linux/spinlock_types_raw.h>
10075 +
10076 +#ifndef CONFIG_PREEMPT_RT_FULL
10077 +# include <linux/spinlock_types_nort.h>
10078 +# include <linux/rwlock_types.h>
10079  #else
10080 -# include <linux/spinlock_types_up.h>
10081 +# include <linux/rtmutex.h>
10082 +# include <linux/spinlock_types_rt.h>
10083 +# include <linux/rwlock_types_rt.h>
10084  #endif
10085  
10086 -#include <linux/lockdep.h>
10087 -
10088 -typedef struct raw_spinlock {
10089 -       arch_spinlock_t raw_lock;
10090 -#ifdef CONFIG_GENERIC_LOCKBREAK
10091 -       unsigned int break_lock;
10092 -#endif
10093 -#ifdef CONFIG_DEBUG_SPINLOCK
10094 -       unsigned int magic, owner_cpu;
10095 -       void *owner;
10096 -#endif
10097 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10098 -       struct lockdep_map dep_map;
10099 -#endif
10100 -} raw_spinlock_t;
10101 -
10102 -#define SPINLOCK_MAGIC         0xdead4ead
10103 -
10104 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10105 -
10106 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10107 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10108 -#else
10109 -# define SPIN_DEP_MAP_INIT(lockname)
10110 -#endif
10111 -
10112 -#ifdef CONFIG_DEBUG_SPINLOCK
10113 -# define SPIN_DEBUG_INIT(lockname)             \
10114 -       .magic = SPINLOCK_MAGIC,                \
10115 -       .owner_cpu = -1,                        \
10116 -       .owner = SPINLOCK_OWNER_INIT,
10117 -#else
10118 -# define SPIN_DEBUG_INIT(lockname)
10119 -#endif
10120 -
10121 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10122 -       {                                       \
10123 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10124 -       SPIN_DEBUG_INIT(lockname)               \
10125 -       SPIN_DEP_MAP_INIT(lockname) }
10126 -
10127 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10128 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10129 -
10130 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10131 -
10132 -typedef struct spinlock {
10133 -       union {
10134 -               struct raw_spinlock rlock;
10135 -
10136 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10137 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10138 -               struct {
10139 -                       u8 __padding[LOCK_PADSIZE];
10140 -                       struct lockdep_map dep_map;
10141 -               };
10142 -#endif
10143 -       };
10144 -} spinlock_t;
10145 -
10146 -#define __SPIN_LOCK_INITIALIZER(lockname) \
10147 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10148 -
10149 -#define __SPIN_LOCK_UNLOCKED(lockname) \
10150 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10151 -
10152 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10153 -
10154 -#include <linux/rwlock_types.h>
10155 -
10156  #endif /* __LINUX_SPINLOCK_TYPES_H */
10157 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
10158 new file mode 100644
10159 index 000000000000..f1dac1fb1d6a
10160 --- /dev/null
10161 +++ b/include/linux/spinlock_types_nort.h
10162 @@ -0,0 +1,33 @@
10163 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
10164 +#define __LINUX_SPINLOCK_TYPES_NORT_H
10165 +
10166 +#ifndef __LINUX_SPINLOCK_TYPES_H
10167 +#error "Do not include directly. Include spinlock_types.h instead"
10168 +#endif
10169 +
10170 +/*
10171 + * The non-RT version maps spinlocks to raw_spinlocks
10172 + */
10173 +typedef struct spinlock {
10174 +       union {
10175 +               struct raw_spinlock rlock;
10176 +
10177 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10178 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10179 +               struct {
10180 +                       u8 __padding[LOCK_PADSIZE];
10181 +                       struct lockdep_map dep_map;
10182 +               };
10183 +#endif
10184 +       };
10185 +} spinlock_t;
10186 +
10187 +#define __SPIN_LOCK_INITIALIZER(lockname) \
10188 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10189 +
10190 +#define __SPIN_LOCK_UNLOCKED(lockname) \
10191 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10192 +
10193 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10194 +
10195 +#endif
10196 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
10197 new file mode 100644
10198 index 000000000000..edffc4d53fc9
10199 --- /dev/null
10200 +++ b/include/linux/spinlock_types_raw.h
10201 @@ -0,0 +1,56 @@
10202 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
10203 +#define __LINUX_SPINLOCK_TYPES_RAW_H
10204 +
10205 +#if defined(CONFIG_SMP)
10206 +# include <asm/spinlock_types.h>
10207 +#else
10208 +# include <linux/spinlock_types_up.h>
10209 +#endif
10210 +
10211 +#include <linux/lockdep.h>
10212 +
10213 +typedef struct raw_spinlock {
10214 +       arch_spinlock_t raw_lock;
10215 +#ifdef CONFIG_GENERIC_LOCKBREAK
10216 +       unsigned int break_lock;
10217 +#endif
10218 +#ifdef CONFIG_DEBUG_SPINLOCK
10219 +       unsigned int magic, owner_cpu;
10220 +       void *owner;
10221 +#endif
10222 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10223 +       struct lockdep_map dep_map;
10224 +#endif
10225 +} raw_spinlock_t;
10226 +
10227 +#define SPINLOCK_MAGIC         0xdead4ead
10228 +
10229 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10230 +
10231 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10232 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10233 +#else
10234 +# define SPIN_DEP_MAP_INIT(lockname)
10235 +#endif
10236 +
10237 +#ifdef CONFIG_DEBUG_SPINLOCK
10238 +# define SPIN_DEBUG_INIT(lockname)             \
10239 +       .magic = SPINLOCK_MAGIC,                \
10240 +       .owner_cpu = -1,                        \
10241 +       .owner = SPINLOCK_OWNER_INIT,
10242 +#else
10243 +# define SPIN_DEBUG_INIT(lockname)
10244 +#endif
10245 +
10246 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10247 +       {                                       \
10248 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10249 +       SPIN_DEBUG_INIT(lockname)               \
10250 +       SPIN_DEP_MAP_INIT(lockname) }
10251 +
10252 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10253 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10254 +
10255 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10256 +
10257 +#endif
10258 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
10259 new file mode 100644
10260 index 000000000000..3e3d8c5f7a9a
10261 --- /dev/null
10262 +++ b/include/linux/spinlock_types_rt.h
10263 @@ -0,0 +1,48 @@
10264 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
10265 +#define __LINUX_SPINLOCK_TYPES_RT_H
10266 +
10267 +#ifndef __LINUX_SPINLOCK_TYPES_H
10268 +#error "Do not include directly. Include spinlock_types.h instead"
10269 +#endif
10270 +
10271 +#include <linux/cache.h>
10272 +
10273 +/*
10274 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
10275 + */
10276 +typedef struct spinlock {
10277 +       struct rt_mutex         lock;
10278 +       unsigned int            break_lock;
10279 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10280 +       struct lockdep_map      dep_map;
10281 +#endif
10282 +} spinlock_t;
10283 +
10284 +#ifdef CONFIG_DEBUG_RT_MUTEXES
10285 +# define __RT_SPIN_INITIALIZER(name) \
10286 +       { \
10287 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
10288 +       .save_state = 1, \
10289 +       .file = __FILE__, \
10290 +       .line = __LINE__ , \
10291 +       }
10292 +#else
10293 +# define __RT_SPIN_INITIALIZER(name) \
10294 +       {                                                               \
10295 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
10296 +       .save_state = 1, \
10297 +       }
10298 +#endif
10299 +
10300 +/*
10301 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
10302 +*/
10303 +
10304 +#define __SPIN_LOCK_UNLOCKED(name)                     \
10305 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
10306 +         SPIN_DEP_MAP_INIT(name) }
10307 +
10308 +#define DEFINE_SPINLOCK(name) \
10309 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
10310 +
10311 +#endif
10312 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
10313 index dc8eb63c6568..e793d3a257da 100644
10314 --- a/include/linux/srcu.h
10315 +++ b/include/linux/srcu.h
10316 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
10317  
10318  void process_srcu(struct work_struct *work);
10319  
10320 -#define __SRCU_STRUCT_INIT(name)                                       \
10321 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
10322         {                                                               \
10323                 .completed = -300,                                      \
10324 -               .per_cpu_ref = &name##_srcu_array,                      \
10325 +               .per_cpu_ref = &pcpu_name,                              \
10326                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
10327                 .running = false,                                       \
10328                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
10329 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
10330   */
10331  #define __DEFINE_SRCU(name, is_static)                                 \
10332         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
10333 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
10334 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
10335  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
10336  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
10337  
10338 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
10339 index d9718378a8be..e81e6dc7dcb1 100644
10340 --- a/include/linux/suspend.h
10341 +++ b/include/linux/suspend.h
10342 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
10343         void (*end)(void);
10344  };
10345  
10346 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
10347 +extern bool pm_in_action;
10348 +#else
10349 +# define pm_in_action false
10350 +#endif
10351 +
10352  #ifdef CONFIG_SUSPEND
10353  /**
10354   * suspend_set_ops - set platform dependent suspend operations
10355 diff --git a/include/linux/swait.h b/include/linux/swait.h
10356 index c1f9c62a8a50..83f004a72320 100644
10357 --- a/include/linux/swait.h
10358 +++ b/include/linux/swait.h
10359 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
10360  extern void swake_up(struct swait_queue_head *q);
10361  extern void swake_up_all(struct swait_queue_head *q);
10362  extern void swake_up_locked(struct swait_queue_head *q);
10363 +extern void swake_up_all_locked(struct swait_queue_head *q);
10364  
10365  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
10366  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
10367 diff --git a/include/linux/swap.h b/include/linux/swap.h
10368 index 55ff5593c193..52bf5477dc92 100644
10369 --- a/include/linux/swap.h
10370 +++ b/include/linux/swap.h
10371 @@ -11,6 +11,7 @@
10372  #include <linux/fs.h>
10373  #include <linux/atomic.h>
10374  #include <linux/page-flags.h>
10375 +#include <linux/locallock.h>
10376  #include <asm/page.h>
10377  
10378  struct notifier_block;
10379 @@ -247,7 +248,8 @@ struct swap_info_struct {
10380  void *workingset_eviction(struct address_space *mapping, struct page *page);
10381  bool workingset_refault(void *shadow);
10382  void workingset_activation(struct page *page);
10383 -extern struct list_lru workingset_shadow_nodes;
10384 +extern struct list_lru __workingset_shadow_nodes;
10385 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
10386  
10387  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10388  {
10389 @@ -292,6 +294,7 @@ extern unsigned long nr_free_pagecache_pages(void);
10390  
10391  
10392  /* linux/mm/swap.c */
10393 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10394  extern void lru_cache_add(struct page *);
10395  extern void lru_cache_add_anon(struct page *page);
10396  extern void lru_cache_add_file(struct page *page);
10397 diff --git a/include/linux/swork.h b/include/linux/swork.h
10398 new file mode 100644
10399 index 000000000000..f175fa9a6016
10400 --- /dev/null
10401 +++ b/include/linux/swork.h
10402 @@ -0,0 +1,24 @@
10403 +#ifndef _LINUX_SWORK_H
10404 +#define _LINUX_SWORK_H
10405 +
10406 +#include <linux/list.h>
10407 +
10408 +struct swork_event {
10409 +       struct list_head item;
10410 +       unsigned long flags;
10411 +       void (*func)(struct swork_event *);
10412 +};
10413 +
10414 +static inline void INIT_SWORK(struct swork_event *event,
10415 +                             void (*func)(struct swork_event *))
10416 +{
10417 +       event->flags = 0;
10418 +       event->func = func;
10419 +}
10420 +
10421 +bool swork_queue(struct swork_event *sev);
10422 +
10423 +int swork_get(void);
10424 +void swork_put(void);
10425 +
10426 +#endif /* _LINUX_SWORK_H */
10427 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
10428 index 2873baf5372a..eb1a108f17ca 100644
10429 --- a/include/linux/thread_info.h
10430 +++ b/include/linux/thread_info.h
10431 @@ -107,7 +107,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
10432  #define test_thread_flag(flag) \
10433         test_ti_thread_flag(current_thread_info(), flag)
10434  
10435 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10436 +#ifdef CONFIG_PREEMPT_LAZY
10437 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10438 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10439 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10440 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
10441 +
10442 +#else
10443 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10444 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10445 +#define tif_need_resched_lazy()        0
10446 +#endif
10447  
10448  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10449  static inline int arch_within_stack_frames(const void * const stack,
10450 diff --git a/include/linux/timer.h b/include/linux/timer.h
10451 index 51d601f192d4..83cea629efe1 100644
10452 --- a/include/linux/timer.h
10453 +++ b/include/linux/timer.h
10454 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
10455  
10456  extern int try_to_del_timer_sync(struct timer_list *timer);
10457  
10458 -#ifdef CONFIG_SMP
10459 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10460    extern int del_timer_sync(struct timer_list *timer);
10461  #else
10462  # define del_timer_sync(t)             del_timer(t)
10463 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
10464 index be007610ceb0..15154b13a53b 100644
10465 --- a/include/linux/trace_events.h
10466 +++ b/include/linux/trace_events.h
10467 @@ -56,6 +56,9 @@ struct trace_entry {
10468         unsigned char           flags;
10469         unsigned char           preempt_count;
10470         int                     pid;
10471 +       unsigned short          migrate_disable;
10472 +       unsigned short          padding;
10473 +       unsigned char           preempt_lazy_count;
10474  };
10475  
10476  #define TRACE_EVENT_TYPE_MAX                                           \
10477 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
10478 index f30c187ed785..83bf0f798426 100644
10479 --- a/include/linux/uaccess.h
10480 +++ b/include/linux/uaccess.h
10481 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
10482   */
10483  static inline void pagefault_disable(void)
10484  {
10485 +       migrate_disable();
10486         pagefault_disabled_inc();
10487         /*
10488          * make sure to have issued the store before a pagefault
10489 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
10490          */
10491         barrier();
10492         pagefault_disabled_dec();
10493 +       migrate_enable();
10494  }
10495  
10496  /*
10497 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
10498 index 4a29c75b146e..0a294e950df8 100644
10499 --- a/include/linux/uprobes.h
10500 +++ b/include/linux/uprobes.h
10501 @@ -27,6 +27,7 @@
10502  #include <linux/errno.h>
10503  #include <linux/rbtree.h>
10504  #include <linux/types.h>
10505 +#include <linux/wait.h>
10506  
10507  struct vm_area_struct;
10508  struct mm_struct;
10509 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
10510 index 613771909b6e..e28c5a43229d 100644
10511 --- a/include/linux/vmstat.h
10512 +++ b/include/linux/vmstat.h
10513 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
10514   */
10515  static inline void __count_vm_event(enum vm_event_item item)
10516  {
10517 +       preempt_disable_rt();
10518         raw_cpu_inc(vm_event_states.event[item]);
10519 +       preempt_enable_rt();
10520  }
10521  
10522  static inline void count_vm_event(enum vm_event_item item)
10523 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
10524  
10525  static inline void __count_vm_events(enum vm_event_item item, long delta)
10526  {
10527 +       preempt_disable_rt();
10528         raw_cpu_add(vm_event_states.event[item], delta);
10529 +       preempt_enable_rt();
10530  }
10531  
10532  static inline void count_vm_events(enum vm_event_item item, long delta)
10533 diff --git a/include/linux/wait.h b/include/linux/wait.h
10534 index 2408e8d5c05c..db50d6609195 100644
10535 --- a/include/linux/wait.h
10536 +++ b/include/linux/wait.h
10537 @@ -8,6 +8,7 @@
10538  #include <linux/spinlock.h>
10539  #include <asm/current.h>
10540  #include <uapi/linux/wait.h>
10541 +#include <linux/atomic.h>
10542  
10543  typedef struct __wait_queue wait_queue_t;
10544  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10545 diff --git a/include/net/dst.h b/include/net/dst.h
10546 index 6835d224d47b..55a5a9698f14 100644
10547 --- a/include/net/dst.h
10548 +++ b/include/net/dst.h
10549 @@ -446,7 +446,7 @@ static inline void dst_confirm(struct dst_entry *dst)
10550  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10551                                    struct sk_buff *skb)
10552  {
10553 -       const struct hh_cache *hh;
10554 +       struct hh_cache *hh;
10555  
10556         if (dst->pending_confirm) {
10557                 unsigned long now = jiffies;
10558 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
10559 index 231e121cc7d9..d125222b979d 100644
10560 --- a/include/net/gen_stats.h
10561 +++ b/include/net/gen_stats.h
10562 @@ -5,6 +5,7 @@
10563  #include <linux/socket.h>
10564  #include <linux/rtnetlink.h>
10565  #include <linux/pkt_sched.h>
10566 +#include <net/net_seq_lock.h>
10567  
10568  struct gnet_stats_basic_cpu {
10569         struct gnet_stats_basic_packed bstats;
10570 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
10571                                  spinlock_t *lock, struct gnet_dump *d,
10572                                  int padattr);
10573  
10574 -int gnet_stats_copy_basic(const seqcount_t *running,
10575 +int gnet_stats_copy_basic(net_seqlock_t *running,
10576                           struct gnet_dump *d,
10577                           struct gnet_stats_basic_cpu __percpu *cpu,
10578                           struct gnet_stats_basic_packed *b);
10579 -void __gnet_stats_copy_basic(const seqcount_t *running,
10580 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10581                              struct gnet_stats_basic_packed *bstats,
10582                              struct gnet_stats_basic_cpu __percpu *cpu,
10583                              struct gnet_stats_basic_packed *b);
10584 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
10585                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10586                       struct gnet_stats_rate_est64 *rate_est,
10587                       spinlock_t *stats_lock,
10588 -                     seqcount_t *running, struct nlattr *opt);
10589 +                     net_seqlock_t *running, struct nlattr *opt);
10590  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10591                         struct gnet_stats_rate_est64 *rate_est);
10592  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10593                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10594                           struct gnet_stats_rate_est64 *rate_est,
10595                           spinlock_t *stats_lock,
10596 -                         seqcount_t *running, struct nlattr *opt);
10597 +                         net_seqlock_t *running, struct nlattr *opt);
10598  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10599                           const struct gnet_stats_rate_est64 *rate_est);
10600  #endif
10601 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
10602 index 8b683841e574..bf656008f6e7 100644
10603 --- a/include/net/neighbour.h
10604 +++ b/include/net/neighbour.h
10605 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
10606  }
10607  #endif
10608  
10609 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10610 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10611  {
10612         unsigned int seq;
10613         int hh_len;
10614 @@ -501,7 +501,7 @@ struct neighbour_cb {
10615  
10616  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10617  
10618 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10619 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10620                                      const struct net_device *dev)
10621  {
10622         unsigned int seq;
10623 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
10624 new file mode 100644
10625 index 000000000000..a7034298a82a
10626 --- /dev/null
10627 +++ b/include/net/net_seq_lock.h
10628 @@ -0,0 +1,15 @@
10629 +#ifndef __NET_NET_SEQ_LOCK_H__
10630 +#define __NET_NET_SEQ_LOCK_H__
10631 +
10632 +#ifdef CONFIG_PREEMPT_RT_BASE
10633 +# define net_seqlock_t                 seqlock_t
10634 +# define net_seq_begin(__r)            read_seqbegin(__r)
10635 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10636 +
10637 +#else
10638 +# define net_seqlock_t                 seqcount_t
10639 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10640 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10641 +#endif
10642 +
10643 +#endif
10644 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
10645 index 7adf4386ac8f..d3fd5c357268 100644
10646 --- a/include/net/netns/ipv4.h
10647 +++ b/include/net/netns/ipv4.h
10648 @@ -69,6 +69,7 @@ struct netns_ipv4 {
10649  
10650         int sysctl_icmp_echo_ignore_all;
10651         int sysctl_icmp_echo_ignore_broadcasts;
10652 +       int sysctl_icmp_echo_sysrq;
10653         int sysctl_icmp_ignore_bogus_error_responses;
10654         int sysctl_icmp_ratelimit;
10655         int sysctl_icmp_ratemask;
10656 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
10657 index e6aa0a249672..b57736f2a8a3 100644
10658 --- a/include/net/sch_generic.h
10659 +++ b/include/net/sch_generic.h
10660 @@ -10,6 +10,7 @@
10661  #include <linux/dynamic_queue_limits.h>
10662  #include <net/gen_stats.h>
10663  #include <net/rtnetlink.h>
10664 +#include <net/net_seq_lock.h>
10665  
10666  struct Qdisc_ops;
10667  struct qdisc_walker;
10668 @@ -86,7 +87,7 @@ struct Qdisc {
10669         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10670         struct qdisc_skb_head   q;
10671         struct gnet_stats_basic_packed bstats;
10672 -       seqcount_t              running;
10673 +       net_seqlock_t           running;
10674         struct gnet_stats_queue qstats;
10675         unsigned long           state;
10676         struct Qdisc            *next_sched;
10677 @@ -98,13 +99,22 @@ struct Qdisc {
10678         spinlock_t              busylock ____cacheline_aligned_in_smp;
10679  };
10680  
10681 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10682 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10683  {
10684 +#ifdef CONFIG_PREEMPT_RT_BASE
10685 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10686 +#else
10687         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10688 +#endif
10689  }
10690  
10691  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10692  {
10693 +#ifdef CONFIG_PREEMPT_RT_BASE
10694 +       if (try_write_seqlock(&qdisc->running))
10695 +               return true;
10696 +       return false;
10697 +#else
10698         if (qdisc_is_running(qdisc))
10699                 return false;
10700         /* Variant of write_seqcount_begin() telling lockdep a trylock
10701 @@ -113,11 +123,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10702         raw_write_seqcount_begin(&qdisc->running);
10703         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10704         return true;
10705 +#endif
10706  }
10707  
10708  static inline void qdisc_run_end(struct Qdisc *qdisc)
10709  {
10710 +#ifdef CONFIG_PREEMPT_RT_BASE
10711 +       write_sequnlock(&qdisc->running);
10712 +#else
10713         write_seqcount_end(&qdisc->running);
10714 +#endif
10715  }
10716  
10717  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10718 @@ -308,7 +323,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
10719         return qdisc_lock(root);
10720  }
10721  
10722 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10723 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10724  {
10725         struct Qdisc *root = qdisc_root_sleeping(qdisc);
10726  
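
The net_seqlock_t wrapper introduced above keeps one read-side idiom for the qdisc code: net_seq_begin()/net_seq_retry() resolve to seqcount operations on !RT and to full seqlock operations (which can boost the writer) on RT. An illustrative reader over the Qdisc::running field changed in this hunk; the function name is an assumption, not part of the diff:

#include <linux/types.h>
#include <net/sch_generic.h>

/* Snapshot the byte counter of @q without taking the qdisc running "lock". */
static u64 qdisc_sample_bytes(struct Qdisc *q)
{
        unsigned int seq;
        u64 bytes;

        do {
                seq = net_seq_begin(&q->running);
                bytes = q->bstats.bytes;
        } while (net_seq_retry(&q->running, seq));

        return bytes;
}
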
10727 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
10728 new file mode 100644
10729 index 000000000000..f7710de1b1f3
10730 --- /dev/null
10731 +++ b/include/trace/events/hist.h
10732 @@ -0,0 +1,73 @@
10733 +#undef TRACE_SYSTEM
10734 +#define TRACE_SYSTEM hist
10735 +
10736 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10737 +#define _TRACE_HIST_H
10738 +
10739 +#include "latency_hist.h"
10740 +#include <linux/tracepoint.h>
10741 +
10742 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10743 +#define trace_preemptirqsoff_hist(a, b)
10744 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10745 +#else
10746 +TRACE_EVENT(preemptirqsoff_hist,
10747 +
10748 +       TP_PROTO(int reason, int starthist),
10749 +
10750 +       TP_ARGS(reason, starthist),
10751 +
10752 +       TP_STRUCT__entry(
10753 +               __field(int,    reason)
10754 +               __field(int,    starthist)
10755 +       ),
10756 +
10757 +       TP_fast_assign(
10758 +               __entry->reason         = reason;
10759 +               __entry->starthist      = starthist;
10760 +       ),
10761 +
10762 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10763 +                 __entry->starthist ? "start" : "stop")
10764 +);
10765 +#endif
10766 +
10767 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10768 +#define trace_hrtimer_interrupt(a, b, c, d)
10769 +#else
10770 +TRACE_EVENT(hrtimer_interrupt,
10771 +
10772 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10773 +               struct task_struct *task),
10774 +
10775 +       TP_ARGS(cpu, offset, curr, task),
10776 +
10777 +       TP_STRUCT__entry(
10778 +               __field(int,            cpu)
10779 +               __field(long long,      offset)
10780 +               __array(char,           ccomm,  TASK_COMM_LEN)
10781 +               __field(int,            cprio)
10782 +               __array(char,           tcomm,  TASK_COMM_LEN)
10783 +               __field(int,            tprio)
10784 +       ),
10785 +
10786 +       TP_fast_assign(
10787 +               __entry->cpu    = cpu;
10788 +               __entry->offset = offset;
10789 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10790 +               __entry->cprio  = curr->prio;
10791 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10792 +                       task != NULL ? TASK_COMM_LEN : 7);
10793 +               __entry->tprio  = task != NULL ? task->prio : -1;
10794 +       ),
10795 +
10796 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10797 +               __entry->cpu, __entry->offset, __entry->ccomm,
10798 +               __entry->cprio, __entry->tcomm, __entry->tprio)
10799 +);
10800 +#endif
10801 +
10802 +#endif /* _TRACE_HIST_H */
10803 +
10804 +/* This part must be outside protection */
10805 +#include <trace/define_trace.h>
10806 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
10807 new file mode 100644
10808 index 000000000000..d3f2fbd560b1
10809 --- /dev/null
10810 +++ b/include/trace/events/latency_hist.h
10811 @@ -0,0 +1,29 @@
10812 +#ifndef _LATENCY_HIST_H
10813 +#define _LATENCY_HIST_H
10814 +
10815 +enum hist_action {
10816 +       IRQS_ON,
10817 +       PREEMPT_ON,
10818 +       TRACE_STOP,
10819 +       IRQS_OFF,
10820 +       PREEMPT_OFF,
10821 +       TRACE_START,
10822 +};
10823 +
10824 +static char *actions[] = {
10825 +       "IRQS_ON",
10826 +       "PREEMPT_ON",
10827 +       "TRACE_STOP",
10828 +       "IRQS_OFF",
10829 +       "PREEMPT_OFF",
10830 +       "TRACE_START",
10831 +};
10832 +
10833 +static inline char *getaction(int action)
10834 +{
10835 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10836 +               return actions[action];
10837 +       return "unknown";
10838 +}
10839 +
10840 +#endif /* _LATENCY_HIST_H */
10841 diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
10842 index 9b90c57517a9..516ae88cddf4 100644
10843 --- a/include/trace/events/sched.h
10844 +++ b/include/trace/events/sched.h
10845 @@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
10846         TP_fast_assign(
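
get_cpu_light()/put_cpu_light() added in the smp.h hunk above pin the task with migrate_disable() instead of disabling preemption, so per-CPU data stays stable while the section remains preemptible on RT. A sketch of a typical caller (the per-CPU variable and function names are assumed for illustration):

#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_count_hit(void)
{
        int cpu = get_cpu_light();      /* migrate_disable() + smp_processor_id() */

        per_cpu(demo_hits, cpu)++;
        put_cpu_light();                /* migrate_enable() */
}
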
10847                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
10848                 __entry->pid            = p->pid;
10849 -               __entry->prio           = p->prio;
10850 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
10851                 __entry->success        = 1; /* rudiment, kill when possible */
10852                 __entry->target_cpu     = task_cpu(p);
10853         ),
10854 @@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch,
10855                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
10856                 __entry->next_pid       = next->pid;
10857                 __entry->next_prio      = next->prio;
10858 +               /* XXX SCHED_DEADLINE */
10859         ),
10860  
10861         TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
10862 @@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task,
10863         TP_fast_assign(
10864                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
10865                 __entry->pid            = p->pid;
10866 -               __entry->prio           = p->prio;
10867 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
10868                 __entry->orig_cpu       = task_cpu(p);
10869                 __entry->dest_cpu       = dest_cpu;
10870         ),
10871 @@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
10872         TP_fast_assign(
10873                 memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
10874                 __entry->pid            = p->pid;
10875 -               __entry->prio           = p->prio;
10876 +               __entry->prio           = p->prio; /* XXX SCHED_DEADLINE */
10877         ),
10878  
10879         TP_printk("comm=%s pid=%d prio=%d",
10880 @@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait,
10881         TP_fast_assign(
10882                 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
10883                 __entry->pid            = pid_nr(pid);
10884 -               __entry->prio           = current->prio;
10885 +               __entry->prio           = current->prio; /* XXX SCHED_DEADLINE */
10886         ),
10887  
10888         TP_printk("comm=%s pid=%d prio=%d",
10889 @@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
10890   */
10891  TRACE_EVENT(sched_pi_setprio,
10892  
10893 -       TP_PROTO(struct task_struct *tsk, int newprio),
10894 +       TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
10895  
10896 -       TP_ARGS(tsk, newprio),
10897 +       TP_ARGS(tsk, pi_task),
10898  
10899         TP_STRUCT__entry(
10900                 __array( char,  comm,   TASK_COMM_LEN   )
10901 @@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio,
10902                 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
10903                 __entry->pid            = tsk->pid;
10904                 __entry->oldprio        = tsk->prio;
10905 -               __entry->newprio        = newprio;
10906 +               __entry->newprio        = pi_task ? pi_task->prio : tsk->prio;
10907 +               /* XXX SCHED_DEADLINE bits missing */
10908         ),
10909  
10910         TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
10911 diff --git a/init/Kconfig b/init/Kconfig
10912 index 34407f15e6d3..2ce33a32e65d 100644
10913 --- a/init/Kconfig
10914 +++ b/init/Kconfig
10915 @@ -506,7 +506,7 @@ config TINY_RCU
10916  
10917  config RCU_EXPERT
10918         bool "Make expert-level adjustments to RCU configuration"
10919 -       default n
10920 +       default y if PREEMPT_RT_FULL
10921         help
10922           This option needs to be enabled if you wish to make
10923           expert-level adjustments to RCU configuration.  By default,
10924 @@ -623,7 +623,7 @@ config RCU_FANOUT_LEAF
10925  
10926  config RCU_FAST_NO_HZ
10927         bool "Accelerate last non-dyntick-idle CPU's grace periods"
10928 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10929 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10930         default n
10931         help
10932           This option permits CPUs to enter dynticks-idle state even if
10933 @@ -650,7 +650,7 @@ config TREE_RCU_TRACE
10934  config RCU_BOOST
10935         bool "Enable RCU priority boosting"
10936         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10937 -       default n
10938 +       default y if PREEMPT_RT_FULL
10939         help
10940           This option boosts the priority of preempted RCU readers that
10941           block the current preemptible RCU grace period for too long.
10942 @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL
10943  
10944  endchoice
10945  
10946 -config RCU_EXPEDITE_BOOT
10947 -       bool
10948 -       default n
10949 -       help
10950 -         This option enables expedited grace periods at boot time,
10951 -         as if rcu_expedite_gp() had been invoked early in boot.
10952 -         The corresponding rcu_unexpedite_gp() is invoked from
10953 -         rcu_end_inkernel_boot(), which is intended to be invoked
10954 -         at the end of the kernel-only boot sequence, just before
10955 -         init is exec'ed.
10956 -
10957 -         Accept the default if unsure.
10958 -
10959  endmenu # "RCU Subsystem"
10960  
10961  config BUILD_BIN2C
10962 @@ -1064,6 +1051,7 @@ config CFS_BANDWIDTH
10963  config RT_GROUP_SCHED
10964         bool "Group scheduling for SCHED_RR/FIFO"
10965         depends on CGROUP_SCHED
10966 +       depends on !PREEMPT_RT_FULL
10967         default n
10968         help
10969           This feature lets you explicitly allocate real CPU bandwidth
10970 @@ -1772,6 +1760,7 @@ choice
10971  
10972  config SLAB
10973         bool "SLAB"
10974 +       depends on !PREEMPT_RT_FULL
10975         select HAVE_HARDENED_USERCOPY_ALLOCATOR
10976         help
10977           The regular slab allocator that is established and known to work
10978 @@ -1792,6 +1781,7 @@ config SLUB
10979  config SLOB
10980         depends on EXPERT
10981         bool "SLOB (Simple Allocator)"
10982 +       depends on !PREEMPT_RT_FULL
10983         help
10984            SLOB replaces the stock allocator with a drastically simpler
10985            allocator. SLOB is generally more space efficient but
10986 @@ -1810,7 +1800,7 @@ config SLAB_FREELIST_RANDOM
10987  
10988  config SLUB_CPU_PARTIAL
10989         default y
10990 -       depends on SLUB && SMP
10991 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
10992         bool "SLUB per cpu partial cache"
10993         help
10994           Per cpu partial caches accellerate objects allocation and freeing
10995 diff --git a/init/Makefile b/init/Makefile
10996 index c4fb45525d08..821190dfaa75 100644
10997 --- a/init/Makefile
10998 +++ b/init/Makefile
10999 @@ -35,4 +35,4 @@ $(obj)/version.o: include/generated/compile.h
11000  include/generated/compile.h: FORCE
11001         @$($(quiet)chk_compile.h)
11002         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
11003 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
11004 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
11005 diff --git a/init/main.c b/init/main.c
11006 index ae3996ae9bac..6470deef01c9 100644
11007 --- a/init/main.c
11008 +++ b/init/main.c
11009 @@ -507,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
11010         setup_command_line(command_line);
11011         setup_nr_cpu_ids();
11012         setup_per_cpu_areas();
11013 +       softirq_early_init();
11014         boot_cpu_state_init();
11015         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
11016  
11017 diff --git a/ipc/sem.c b/ipc/sem.c
11018 index 10b94bc59d4a..b8360eaacc7a 100644
11019 --- a/ipc/sem.c
11020 +++ b/ipc/sem.c
11021 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
11022  static void wake_up_sem_queue_prepare(struct list_head *pt,
11023                                 struct sem_queue *q, int error)
11024  {
11025 +#ifdef CONFIG_PREEMPT_RT_BASE
11026 +       struct task_struct *p = q->sleeper;
11027 +       get_task_struct(p);
11028 +       q->status = error;
11029 +       wake_up_process(p);
11030 +       put_task_struct(p);
11031 +#else
11032         if (list_empty(pt)) {
11033                 /*
11034                  * Hold preempt off so that we don't get preempted and have the
11035 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
11036         q->pid = error;
11037  
11038         list_add_tail(&q->list, pt);
11039 +#endif
11040  }
11041  
11042  /**
11043 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
11044   */
11045  static void wake_up_sem_queue_do(struct list_head *pt)
11046  {
11047 +#ifndef CONFIG_PREEMPT_RT_BASE
11048         struct sem_queue *q, *t;
11049         int did_something;
11050  
11051 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
11052         }
11053         if (did_something)
11054                 preempt_enable();
11055 +#endif
11056  }
11057  
11058  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
11059 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
11060 index ebdb0043203a..b9e6aa7e5aa6 100644
11061 --- a/kernel/Kconfig.locks
11062 +++ b/kernel/Kconfig.locks
11063 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
11064  
11065  config MUTEX_SPIN_ON_OWNER
11066         def_bool y
11067 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
11068 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11069  
11070  config RWSEM_SPIN_ON_OWNER
11071         def_bool y
11072 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
11073 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
11074  
11075  config LOCK_SPIN_ON_OWNER
11076         def_bool y
11077 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
11078 index 3f9c97419f02..11dbe26a8279 100644
11079 --- a/kernel/Kconfig.preempt
11080 +++ b/kernel/Kconfig.preempt
11081 @@ -1,3 +1,16 @@
11082 +config PREEMPT
11083 +       bool
11084 +       select PREEMPT_COUNT
11085 +
11086 +config PREEMPT_RT_BASE
11087 +       bool
11088 +       select PREEMPT
11089 +
11090 +config HAVE_PREEMPT_LAZY
11091 +       bool
11092 +
11093 +config PREEMPT_LAZY
11094 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
11095  
11096  choice
11097         prompt "Preemption Model"
11098 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
11099  
11100           Select this if you are building a kernel for a desktop system.
11101  
11102 -config PREEMPT
11103 +config PREEMPT__LL
11104         bool "Preemptible Kernel (Low-Latency Desktop)"
11105 -       select PREEMPT_COUNT
11106 +       select PREEMPT
11107         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
11108         help
11109           This option reduces the latency of the kernel by making
11110 @@ -52,6 +65,22 @@ config PREEMPT
11111           embedded system with latency requirements in the milliseconds
11112           range.
11113  
11114 +config PREEMPT_RTB
11115 +       bool "Preemptible Kernel (Basic RT)"
11116 +       select PREEMPT_RT_BASE
11117 +       help
11118 +         This option is basically the same as (Low-Latency Desktop) but
11119 +         enables changes which are preliminary for the full preemptible
11120 +         RT kernel.
11121 +
11122 +config PREEMPT_RT_FULL
11123 +       bool "Fully Preemptible Kernel (RT)"
11124 +       depends on IRQ_FORCED_THREADING
11125 +       select PREEMPT_RT_BASE
11126 +       select PREEMPT_RCU
11127 +       help
11128 +         The fully preemptible RT kernel: sleeping spinlocks and forced-threaded interrupts.
11129 +
11130  endchoice
11131  
11132  config PREEMPT_COUNT
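
The new symbols above are what the rest of this patch branches on: PREEMPT_RT_BASE is
shared by the "Basic RT" and "Fully Preemptible" models, while PREEMPT_RT_FULL marks the
full model only. A rough sketch of the resulting #ifdef idiom (the helper below is
illustrative, not part of the patch):

        #include <linux/kernel.h>

        static inline const char *demo_preempt_model(void)
        {
        #if defined(CONFIG_PREEMPT_RT_FULL)
                return "fully preemptible (RT)";
        #elif defined(CONFIG_PREEMPT_RT_BASE)
                return "basic RT";
        #elif defined(CONFIG_PREEMPT)
                return "low-latency desktop";
        #else
                return "voluntary or none";
        #endif
        }
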
11133 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
11134 index a3d2aad2443f..bb6b252648ff 100644
11135 --- a/kernel/cgroup.c
11136 +++ b/kernel/cgroup.c
11137 @@ -5041,10 +5041,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
11138         queue_work(cgroup_destroy_wq, &css->destroy_work);
11139  }
11140  
11141 -static void css_release_work_fn(struct work_struct *work)
11142 +static void css_release_work_fn(struct swork_event *sev)
11143  {
11144         struct cgroup_subsys_state *css =
11145 -               container_of(work, struct cgroup_subsys_state, destroy_work);
11146 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
11147         struct cgroup_subsys *ss = css->ss;
11148         struct cgroup *cgrp = css->cgroup;
11149  
11150 @@ -5087,8 +5087,8 @@ static void css_release(struct percpu_ref *ref)
11151         struct cgroup_subsys_state *css =
11152                 container_of(ref, struct cgroup_subsys_state, refcnt);
11153  
11154 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
11155 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
11156 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
11157 +       swork_queue(&css->destroy_swork);
11158  }
11159  
11160  static void init_and_link_css(struct cgroup_subsys_state *css,
11161 @@ -5740,6 +5740,7 @@ static int __init cgroup_wq_init(void)
11162          */
11163         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
11164         BUG_ON(!cgroup_destroy_wq);
11165 +       BUG_ON(swork_get());
11166  
11167         /*
11168          * Used to destroy pidlists and separate to serve as flush domain.
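
The hunk above moves the css release from the regular workqueue onto the RT patch's
"simple work" (swork) infrastructure, whose callbacks run from a dedicated kthread in
preemptible process context; swork_get() must have succeeded once during init, which is
what the BUG_ON(swork_get()) checks. A sketch of the same conversion for a hypothetical
object, assuming the swork header added elsewhere in this patch is available:

        #include <linux/slab.h>
        #include <linux/swork.h>

        struct demo_obj {
                struct swork_event release_swork;
        };

        static void demo_release_fn(struct swork_event *sev)
        {
                struct demo_obj *obj = container_of(sev, struct demo_obj, release_swork);

                kfree(obj);             /* safe: runs in preemptible context */
        }

        static void demo_schedule_release(struct demo_obj *obj)
        {
                INIT_SWORK(&obj->release_swork, demo_release_fn);
                swork_queue(&obj->release_swork);
        }
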
11169 diff --git a/kernel/cpu.c b/kernel/cpu.c
11170 index 99c6c568bc55..f1c64e563970 100644
11171 --- a/kernel/cpu.c
11172 +++ b/kernel/cpu.c
11173 @@ -239,6 +239,289 @@ static struct {
11174  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
11175  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
11176  
11177 +/**
11178 + * hotplug_pcp - per cpu hotplug descriptor
11179 + * @unplug:    set when pin_current_cpu() needs to sync tasks
11180 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
11181 + * @refcount:  counter of tasks in pinned sections
11182 + * @grab_lock: set when the tasks entering pinned sections should wait
11183 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
11184 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
11185 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
11186 + *
11187 + * Although @unplug and @sync_tsk may point to the same task, @unplug
11188 + * is used as a flag and remains set after @sync_tsk has exited and
11189 + * been set to NULL.
11190 + */
11191 +struct hotplug_pcp {
11192 +       struct task_struct *unplug;
11193 +       struct task_struct *sync_tsk;
11194 +       int refcount;
11195 +       int grab_lock;
11196 +       struct completion synced;
11197 +       struct completion unplug_wait;
11198 +#ifdef CONFIG_PREEMPT_RT_FULL
11199 +       /*
11200 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
11201 +        * the task, otherwise the mutex will cause the task to fail
11202 +        * to sleep when required. (Because it's called from migrate_disable())
11203 +        *
11204 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
11205 +        * state.
11206 +        */
11207 +       spinlock_t lock;
11208 +#else
11209 +       struct mutex mutex;
11210 +#endif
11211 +       int mutex_init;
11212 +};
11213 +
11214 +#ifdef CONFIG_PREEMPT_RT_FULL
11215 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
11216 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
11217 +#else
11218 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
11219 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
11220 +#endif
11221 +
11222 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
11223 +
11224 +/**
11225 + * pin_current_cpu - Prevent the current cpu from being unplugged
11226 + *
11227 + * Lightweight version of get_online_cpus() to prevent cpu from being
11228 + * unplugged when code runs in a migration disabled region.
11229 + *
11230 + * Must be called with preemption disabled (preempt_count = 1)!
11231 + */
11232 +void pin_current_cpu(void)
11233 +{
11234 +       struct hotplug_pcp *hp;
11235 +       int force = 0;
11236 +
11237 +retry:
11238 +       hp = this_cpu_ptr(&hotplug_pcp);
11239 +
11240 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
11241 +           hp->unplug == current) {
11242 +               hp->refcount++;
11243 +               return;
11244 +       }
11245 +       if (hp->grab_lock) {
11246 +               preempt_enable();
11247 +               hotplug_lock(hp);
11248 +               hotplug_unlock(hp);
11249 +       } else {
11250 +               preempt_enable();
11251 +               /*
11252 +                * Try to push this task off of this CPU.
11253 +                */
11254 +               if (!migrate_me()) {
11255 +                       preempt_disable();
11256 +                       hp = this_cpu_ptr(&hotplug_pcp);
11257 +                       if (!hp->grab_lock) {
11258 +                               /*
11259 +                                * Just let it continue; it's already pinned
11260 +                                * or about to sleep.
11261 +                                */
11262 +                               force = 1;
11263 +                               goto retry;
11264 +                       }
11265 +                       preempt_enable();
11266 +               }
11267 +       }
11268 +       preempt_disable();
11269 +       goto retry;
11270 +}
11271 +
11272 +/**
11273 + * unpin_current_cpu - Allow unplug of current cpu
11274 + *
11275 + * Must be called with preemption or interrupts disabled!
11276 + */
11277 +void unpin_current_cpu(void)
11278 +{
11279 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
11280 +
11281 +       WARN_ON(hp->refcount <= 0);
11282 +
11283 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
11284 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
11285 +               wake_up_process(hp->unplug);
11286 +}
11287 +
11288 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
11289 +{
11290 +       set_current_state(TASK_UNINTERRUPTIBLE);
11291 +       while (hp->refcount) {
11292 +               schedule_preempt_disabled();
11293 +               set_current_state(TASK_UNINTERRUPTIBLE);
11294 +       }
11295 +}
11296 +
11297 +static int sync_unplug_thread(void *data)
11298 +{
11299 +       struct hotplug_pcp *hp = data;
11300 +
11301 +       wait_for_completion(&hp->unplug_wait);
11302 +       preempt_disable();
11303 +       hp->unplug = current;
11304 +       wait_for_pinned_cpus(hp);
11305 +
11306 +       /*
11307 +        * This thread will synchronize the cpu_down() with threads
11308 +        * that have pinned the CPU. When the pinned CPU count reaches
11309 +        * zero, we inform the cpu_down code to continue to the next step.
11310 +        */
11311 +       set_current_state(TASK_UNINTERRUPTIBLE);
11312 +       preempt_enable();
11313 +       complete(&hp->synced);
11314 +
11315 +       /*
11316 +        * If all succeeds, the next step will need tasks to wait till
11317 +        * the CPU is offline before continuing. To do this, the grab_lock
11318 +        * is set and tasks going into pin_current_cpu() will block on the
11319 +        * mutex. But we still need to wait for those that are already in
11320 +        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
11321 +        * will kick this thread out.
11322 +        */
11323 +       while (!hp->grab_lock && !kthread_should_stop()) {
11324 +               schedule();
11325 +               set_current_state(TASK_UNINTERRUPTIBLE);
11326 +       }
11327 +
11328 +       /* Make sure grab_lock is seen before we see a stale completion */
11329 +       smp_mb();
11330 +
11331 +       /*
11332 +        * Now just before cpu_down() enters stop machine, we need to make
11333 +        * sure all tasks that are in pinned CPU sections are out, and new
11334 +        * tasks will now grab the lock, keeping them from entering pinned
11335 +        * CPU sections.
11336 +        */
11337 +       if (!kthread_should_stop()) {
11338 +               preempt_disable();
11339 +               wait_for_pinned_cpus(hp);
11340 +               preempt_enable();
11341 +               complete(&hp->synced);
11342 +       }
11343 +
11344 +       set_current_state(TASK_UNINTERRUPTIBLE);
11345 +       while (!kthread_should_stop()) {
11346 +               schedule();
11347 +               set_current_state(TASK_UNINTERRUPTIBLE);
11348 +       }
11349 +       set_current_state(TASK_RUNNING);
11350 +
11351 +       /*
11352 +        * Force this thread off this CPU as it's going down and
11353 +        * we don't want any more work on this CPU.
11354 +        */
11355 +       current->flags &= ~PF_NO_SETAFFINITY;
11356 +       set_cpus_allowed_ptr(current, cpu_present_mask);
11357 +       migrate_me();
11358 +       return 0;
11359 +}
11360 +
11361 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
11362 +{
11363 +       wake_up_process(hp->sync_tsk);
11364 +       wait_for_completion(&hp->synced);
11365 +}
11366 +
11367 +static void __cpu_unplug_wait(unsigned int cpu)
11368 +{
11369 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11370 +
11371 +       complete(&hp->unplug_wait);
11372 +       wait_for_completion(&hp->synced);
11373 +}
11374 +
11375 +/*
11376 + * Start the sync_unplug_thread on the target cpu and wait for it to
11377 + * complete.
11378 + */
11379 +static int cpu_unplug_begin(unsigned int cpu)
11380 +{
11381 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11382 +       int err;
11383 +
11384 +       /* Protected by cpu_hotplug.lock */
11385 +       if (!hp->mutex_init) {
11386 +#ifdef CONFIG_PREEMPT_RT_FULL
11387 +               spin_lock_init(&hp->lock);
11388 +#else
11389 +               mutex_init(&hp->mutex);
11390 +#endif
11391 +               hp->mutex_init = 1;
11392 +       }
11393 +
11394 +       /* Inform the scheduler to migrate tasks off this CPU */
11395 +       tell_sched_cpu_down_begin(cpu);
11396 +
11397 +       init_completion(&hp->synced);
11398 +       init_completion(&hp->unplug_wait);
11399 +
11400 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
11401 +       if (IS_ERR(hp->sync_tsk)) {
11402 +               err = PTR_ERR(hp->sync_tsk);
11403 +               hp->sync_tsk = NULL;
11404 +               return err;
11405 +       }
11406 +       kthread_bind(hp->sync_tsk, cpu);
11407 +
11408 +       /*
11409 +        * Wait for tasks to get out of the pinned sections;
11410 +        * it's still OK if new tasks enter. Some CPU notifiers will
11411 +        * wait for tasks that are going to enter these sections and
11412 +        * we must not have them block.
11413 +        */
11414 +       wake_up_process(hp->sync_tsk);
11415 +       return 0;
11416 +}
11417 +
11418 +static void cpu_unplug_sync(unsigned int cpu)
11419 +{
11420 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11421 +
11422 +       init_completion(&hp->synced);
11423 +       /* The completion needs to be initialized before setting grab_lock */
11424 +       smp_wmb();
11425 +
11426 +       /* Grab the mutex before setting grab_lock */
11427 +       hotplug_lock(hp);
11428 +       hp->grab_lock = 1;
11429 +
11430 +       /*
11431 +        * The CPU notifiers have been completed.
11432 +        * Wait for tasks to get out of pinned CPU sections and have new
11433 +        * tasks block until the CPU is completely down.
11434 +        */
11435 +       __cpu_unplug_sync(hp);
11436 +
11437 +       /* All done with the sync thread */
11438 +       kthread_stop(hp->sync_tsk);
11439 +       hp->sync_tsk = NULL;
11440 +}
11441 +
11442 +static void cpu_unplug_done(unsigned int cpu)
11443 +{
11444 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11445 +
11446 +       hp->unplug = NULL;
11447 +       /* Let all tasks know cpu unplug is finished before cleaning up */
11448 +       smp_wmb();
11449 +
11450 +       if (hp->sync_tsk)
11451 +               kthread_stop(hp->sync_tsk);
11452 +
11453 +       if (hp->grab_lock) {
11454 +               hotplug_unlock(hp);
11455 +               /* protected by cpu_hotplug.lock */
11456 +               hp->grab_lock = 0;
11457 +       }
11458 +       tell_sched_cpu_down_done(cpu);
11459 +}
11460  
11461  void get_online_cpus(void)
11462  {
11463 @@ -789,10 +1072,14 @@ static int takedown_cpu(unsigned int cpu)
11464         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11465         int err;
11466  
11467 +       __cpu_unplug_wait(cpu);
11468         /* Park the smpboot threads */
11469         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
11470         smpboot_park_threads(cpu);
11471  
11472 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
11473 +       cpu_unplug_sync(cpu);
11474 +
11475         /*
11476          * Prevent irq alloc/free while the dying cpu reorganizes the
11477          * interrupt affinities.
11478 @@ -877,6 +1164,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11479         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11480         int prev_state, ret = 0;
11481         bool hasdied = false;
11482 +       int mycpu;
11483 +       cpumask_var_t cpumask;
11484 +       cpumask_var_t cpumask_org;
11485  
11486         if (num_online_cpus() == 1)
11487                 return -EBUSY;
11488 @@ -884,7 +1174,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11489         if (!cpu_present(cpu))
11490                 return -EINVAL;
11491  
11492 +       /* Move the downtaker off the unplug cpu */
11493 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11494 +               return -ENOMEM;
11495 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11496 +               free_cpumask_var(cpumask);
11497 +               return -ENOMEM;
11498 +       }
11499 +
11500 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11501 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11502 +       set_cpus_allowed_ptr(current, cpumask);
11503 +       free_cpumask_var(cpumask);
11504 +       migrate_disable();
11505 +       mycpu = smp_processor_id();
11506 +       if (mycpu == cpu) {
11507 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11508 +               migrate_enable();
11509 +               ret = -EBUSY;
11510 +               goto restore_cpus;
11511 +       }
11512 +
11513 +       migrate_enable();
11514         cpu_hotplug_begin();
11515 +       ret = cpu_unplug_begin(cpu);
11516 +       if (ret) {
11517 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11518 +               goto out_cancel;
11519 +       }
11520  
11521         cpuhp_tasks_frozen = tasks_frozen;
11522  
11523 @@ -923,10 +1240,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11524  
11525         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11526  out:
11527 +       cpu_unplug_done(cpu);
11528 +out_cancel:
11529         cpu_hotplug_done();
11530         /* This post dead nonsense must die */
11531         if (!ret && hasdied)
11532                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11533 +restore_cpus:
11534 +       set_cpus_allowed_ptr(current, cpumask_org);
11535 +       free_cpumask_var(cpumask_org);
11536         return ret;
11537  }
11538  
11539 @@ -1240,6 +1562,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
11540  
11541  #endif /* CONFIG_PM_SLEEP_SMP */
11542  
11543 +int __boot_cpu_id;
11544 +
11545  #endif /* CONFIG_SMP */
11546  
11547  /* Boot processor state steps */
11548 @@ -1924,6 +2248,10 @@ void __init boot_cpu_init(void)
11549         set_cpu_active(cpu, true);
11550         set_cpu_present(cpu, true);
11551         set_cpu_possible(cpu, true);
11552 +
11553 +#ifdef CONFIG_SMP
11554 +       __boot_cpu_id = cpu;
11555 +#endif
11556  }
11557  
11558  /*
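
Usage sketch for the pinning API introduced above: a task takes a pin with preemption
disabled, and while the pin is held cpu_down() of that CPU cannot complete. In the RT
tree the real callers are migrate_disable()/migrate_enable(); open-coding the calls here
is purely illustrative and assumes the declarations added elsewhere in this patch are in
scope.

        #include <linux/preempt.h>

        static void demo_unplug_safe_section(void)
        {
                preempt_disable();
                pin_current_cpu();      /* refcount++, blocks cpu_down() of this CPU */

                /* ... work that must not race with unplug of this CPU ... */

                unpin_current_cpu();    /* refcount--, wakes the sync_unplug thread if last */
                preempt_enable();
        }
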
11559 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
11560 index 29f815d2ef7e..341b17f24f95 100644
11561 --- a/kernel/cpuset.c
11562 +++ b/kernel/cpuset.c
11563 @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
11564   */
11565  
11566  static DEFINE_MUTEX(cpuset_mutex);
11567 -static DEFINE_SPINLOCK(callback_lock);
11568 +static DEFINE_RAW_SPINLOCK(callback_lock);
11569  
11570  static struct workqueue_struct *cpuset_migrate_mm_wq;
11571  
11572 @@ -907,9 +907,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
11573                         continue;
11574                 rcu_read_unlock();
11575  
11576 -               spin_lock_irq(&callback_lock);
11577 +               raw_spin_lock_irq(&callback_lock);
11578                 cpumask_copy(cp->effective_cpus, new_cpus);
11579 -               spin_unlock_irq(&callback_lock);
11580 +               raw_spin_unlock_irq(&callback_lock);
11581  
11582                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11583                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
11584 @@ -974,9 +974,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
11585         if (retval < 0)
11586                 return retval;
11587  
11588 -       spin_lock_irq(&callback_lock);
11589 +       raw_spin_lock_irq(&callback_lock);
11590         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
11591 -       spin_unlock_irq(&callback_lock);
11592 +       raw_spin_unlock_irq(&callback_lock);
11593  
11594         /* use trialcs->cpus_allowed as a temp variable */
11595         update_cpumasks_hier(cs, trialcs->cpus_allowed);
11596 @@ -1176,9 +1176,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
11597                         continue;
11598                 rcu_read_unlock();
11599  
11600 -               spin_lock_irq(&callback_lock);
11601 +               raw_spin_lock_irq(&callback_lock);
11602                 cp->effective_mems = *new_mems;
11603 -               spin_unlock_irq(&callback_lock);
11604 +               raw_spin_unlock_irq(&callback_lock);
11605  
11606                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11607                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
11608 @@ -1246,9 +1246,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
11609         if (retval < 0)
11610                 goto done;
11611  
11612 -       spin_lock_irq(&callback_lock);
11613 +       raw_spin_lock_irq(&callback_lock);
11614         cs->mems_allowed = trialcs->mems_allowed;
11615 -       spin_unlock_irq(&callback_lock);
11616 +       raw_spin_unlock_irq(&callback_lock);
11617  
11618         /* use trialcs->mems_allowed as a temp variable */
11619         update_nodemasks_hier(cs, &trialcs->mems_allowed);
11620 @@ -1339,9 +1339,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
11621         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
11622                         || (is_spread_page(cs) != is_spread_page(trialcs)));
11623  
11624 -       spin_lock_irq(&callback_lock);
11625 +       raw_spin_lock_irq(&callback_lock);
11626         cs->flags = trialcs->flags;
11627 -       spin_unlock_irq(&callback_lock);
11628 +       raw_spin_unlock_irq(&callback_lock);
11629  
11630         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
11631                 rebuild_sched_domains_locked();
11632 @@ -1756,7 +1756,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11633         cpuset_filetype_t type = seq_cft(sf)->private;
11634         int ret = 0;
11635  
11636 -       spin_lock_irq(&callback_lock);
11637 +       raw_spin_lock_irq(&callback_lock);
11638  
11639         switch (type) {
11640         case FILE_CPULIST:
11641 @@ -1775,7 +1775,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11642                 ret = -EINVAL;
11643         }
11644  
11645 -       spin_unlock_irq(&callback_lock);
11646 +       raw_spin_unlock_irq(&callback_lock);
11647         return ret;
11648  }
11649  
11650 @@ -1989,12 +1989,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
11651  
11652         cpuset_inc();
11653  
11654 -       spin_lock_irq(&callback_lock);
11655 +       raw_spin_lock_irq(&callback_lock);
11656         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11657                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
11658                 cs->effective_mems = parent->effective_mems;
11659         }
11660 -       spin_unlock_irq(&callback_lock);
11661 +       raw_spin_unlock_irq(&callback_lock);
11662  
11663         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
11664                 goto out_unlock;
11665 @@ -2021,12 +2021,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
11666         }
11667         rcu_read_unlock();
11668  
11669 -       spin_lock_irq(&callback_lock);
11670 +       raw_spin_lock_irq(&callback_lock);
11671         cs->mems_allowed = parent->mems_allowed;
11672         cs->effective_mems = parent->mems_allowed;
11673         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
11674         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
11675 -       spin_unlock_irq(&callback_lock);
11676 +       raw_spin_unlock_irq(&callback_lock);
11677  out_unlock:
11678         mutex_unlock(&cpuset_mutex);
11679         return 0;
11680 @@ -2065,7 +2065,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
11681  static void cpuset_bind(struct cgroup_subsys_state *root_css)
11682  {
11683         mutex_lock(&cpuset_mutex);
11684 -       spin_lock_irq(&callback_lock);
11685 +       raw_spin_lock_irq(&callback_lock);
11686  
11687         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11688                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
11689 @@ -2076,7 +2076,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
11690                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
11691         }
11692  
11693 -       spin_unlock_irq(&callback_lock);
11694 +       raw_spin_unlock_irq(&callback_lock);
11695         mutex_unlock(&cpuset_mutex);
11696  }
11697  
11698 @@ -2177,12 +2177,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
11699  {
11700         bool is_empty;
11701  
11702 -       spin_lock_irq(&callback_lock);
11703 +       raw_spin_lock_irq(&callback_lock);
11704         cpumask_copy(cs->cpus_allowed, new_cpus);
11705         cpumask_copy(cs->effective_cpus, new_cpus);
11706         cs->mems_allowed = *new_mems;
11707         cs->effective_mems = *new_mems;
11708 -       spin_unlock_irq(&callback_lock);
11709 +       raw_spin_unlock_irq(&callback_lock);
11710  
11711         /*
11712          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
11713 @@ -2219,10 +2219,10 @@ hotplug_update_tasks(struct cpuset *cs,
11714         if (nodes_empty(*new_mems))
11715                 *new_mems = parent_cs(cs)->effective_mems;
11716  
11717 -       spin_lock_irq(&callback_lock);
11718 +       raw_spin_lock_irq(&callback_lock);
11719         cpumask_copy(cs->effective_cpus, new_cpus);
11720         cs->effective_mems = *new_mems;
11721 -       spin_unlock_irq(&callback_lock);
11722 +       raw_spin_unlock_irq(&callback_lock);
11723  
11724         if (cpus_updated)
11725                 update_tasks_cpumask(cs);
11726 @@ -2308,21 +2308,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
11727  
11728         /* synchronize cpus_allowed to cpu_active_mask */
11729         if (cpus_updated) {
11730 -               spin_lock_irq(&callback_lock);
11731 +               raw_spin_lock_irq(&callback_lock);
11732                 if (!on_dfl)
11733                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
11734                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
11735 -               spin_unlock_irq(&callback_lock);
11736 +               raw_spin_unlock_irq(&callback_lock);
11737                 /* we don't mess with cpumasks of tasks in top_cpuset */
11738         }
11739  
11740         /* synchronize mems_allowed to N_MEMORY */
11741         if (mems_updated) {
11742 -               spin_lock_irq(&callback_lock);
11743 +               raw_spin_lock_irq(&callback_lock);
11744                 if (!on_dfl)
11745                         top_cpuset.mems_allowed = new_mems;
11746                 top_cpuset.effective_mems = new_mems;
11747 -               spin_unlock_irq(&callback_lock);
11748 +               raw_spin_unlock_irq(&callback_lock);
11749                 update_tasks_nodemask(&top_cpuset);
11750         }
11751  
11752 @@ -2420,11 +2420,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
11753  {
11754         unsigned long flags;
11755  
11756 -       spin_lock_irqsave(&callback_lock, flags);
11757 +       raw_spin_lock_irqsave(&callback_lock, flags);
11758         rcu_read_lock();
11759         guarantee_online_cpus(task_cs(tsk), pmask);
11760         rcu_read_unlock();
11761 -       spin_unlock_irqrestore(&callback_lock, flags);
11762 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11763  }
11764  
11765  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
11766 @@ -2472,11 +2472,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
11767         nodemask_t mask;
11768         unsigned long flags;
11769  
11770 -       spin_lock_irqsave(&callback_lock, flags);
11771 +       raw_spin_lock_irqsave(&callback_lock, flags);
11772         rcu_read_lock();
11773         guarantee_online_mems(task_cs(tsk), &mask);
11774         rcu_read_unlock();
11775 -       spin_unlock_irqrestore(&callback_lock, flags);
11776 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11777  
11778         return mask;
11779  }
11780 @@ -2568,14 +2568,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
11781                 return true;
11782  
11783         /* Not hardwall and node outside mems_allowed: scan up cpusets */
11784 -       spin_lock_irqsave(&callback_lock, flags);
11785 +       raw_spin_lock_irqsave(&callback_lock, flags);
11786  
11787         rcu_read_lock();
11788         cs = nearest_hardwall_ancestor(task_cs(current));
11789         allowed = node_isset(node, cs->mems_allowed);
11790         rcu_read_unlock();
11791  
11792 -       spin_unlock_irqrestore(&callback_lock, flags);
11793 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11794         return allowed;
11795  }
11796  
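
The callback_lock conversion above follows the usual RT rule: on PREEMPT_RT_FULL a plain
spinlock_t becomes a sleeping lock, so a lock taken from contexts that must not sleep is
switched to raw_spinlock_t, which keeps real spinning semantics on every configuration.
Minimal sketch of the resulting idiom (demo_lock and demo_value are illustrative):

        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(demo_lock);
        static int demo_value;

        static void demo_update(int v)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&demo_lock, flags);       /* spins even on RT */
                demo_value = v;
                raw_spin_unlock_irqrestore(&demo_lock, flags);
        }
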
11797 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
11798 index fc1ef736253c..83c666537a7a 100644
11799 --- a/kernel/debug/kdb/kdb_io.c
11800 +++ b/kernel/debug/kdb/kdb_io.c
11801 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11802         int linecount;
11803         int colcount;
11804         int logging, saved_loglevel = 0;
11805 -       int saved_trap_printk;
11806         int got_printf_lock = 0;
11807         int retlen = 0;
11808         int fnd, len;
11809 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11810         unsigned long uninitialized_var(flags);
11811  
11812         preempt_disable();
11813 -       saved_trap_printk = kdb_trap_printk;
11814 -       kdb_trap_printk = 0;
11815  
11816         /* Serialize kdb_printf if multiple cpus try to write at once.
11817          * But if any cpu goes recursive in kdb, just print the output,
11818 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11819         } else {
11820                 __release(kdb_printf_lock);
11821         }
11822 -       kdb_trap_printk = saved_trap_printk;
11823         preempt_enable();
11824         return retlen;
11825  }
11826 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
11827         va_list ap;
11828         int r;
11829  
11830 +       kdb_trap_printk++;
11831         va_start(ap, fmt);
11832         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
11833         va_end(ap);
11834 +       kdb_trap_printk--;
11835  
11836         return r;
11837  }
11838 diff --git a/kernel/events/core.c b/kernel/events/core.c
11839 index 07c0dc806dfc..baf1a2867d74 100644
11840 --- a/kernel/events/core.c
11841 +++ b/kernel/events/core.c
11842 @@ -1050,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
11843         raw_spin_lock_init(&cpuctx->hrtimer_lock);
11844         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
11845         timer->function = perf_mux_hrtimer_handler;
11846 +       timer->irqsafe = 1;
11847  }
11848  
11849  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
11850 @@ -8363,6 +8364,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
11851  
11852         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
11853         hwc->hrtimer.function = perf_swevent_hrtimer;
11854 +       hwc->hrtimer.irqsafe = 1;
11855  
11856         /*
11857          * Since hrtimers have a fixed rate, we can do a static freq->period
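
The irqsafe flag set in both hunks above is a field this patch adds to struct hrtimer:
timers so marked keep running their callback from hard interrupt context, while unmarked
timers get deferred to softirq on RT. Sketch of initializing such a timer (the demo_*
names are illustrative):

        #include <linux/hrtimer.h>

        static struct hrtimer demo_timer;

        static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
        {
                return HRTIMER_NORESTART;
        }

        static void demo_timer_setup(void)
        {
                hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                demo_timer.function = demo_timer_fn;
                demo_timer.irqsafe = 1; /* callback stays in hard irq context on RT */
        }
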
11858 diff --git a/kernel/exit.c b/kernel/exit.c
11859 index 3076f3089919..fb2ebcf3ca7c 100644
11860 --- a/kernel/exit.c
11861 +++ b/kernel/exit.c
11862 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
11863          * Do this under ->siglock, we can race with another thread
11864          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
11865          */
11866 -       flush_sigqueue(&tsk->pending);
11867 +       flush_task_sigqueue(tsk);
11868         tsk->sighand = NULL;
11869         spin_unlock(&sighand->siglock);
11870  
11871 diff --git a/kernel/fork.c b/kernel/fork.c
11872 index ba8a01564985..416d91e4af97 100644
11873 --- a/kernel/fork.c
11874 +++ b/kernel/fork.c
11875 @@ -76,6 +76,7 @@
11876  #include <linux/compiler.h>
11877  #include <linux/sysctl.h>
11878  #include <linux/kcov.h>
11879 +#include <linux/kprobes.h>
11880  
11881  #include <asm/pgtable.h>
11882  #include <asm/pgalloc.h>
11883 @@ -376,13 +377,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
11884         if (atomic_dec_and_test(&sig->sigcnt))
11885                 free_signal_struct(sig);
11886  }
11887 -
11888 +#ifdef CONFIG_PREEMPT_RT_BASE
11889 +static
11890 +#endif
11891  void __put_task_struct(struct task_struct *tsk)
11892  {
11893         WARN_ON(!tsk->exit_state);
11894         WARN_ON(atomic_read(&tsk->usage));
11895         WARN_ON(tsk == current);
11896  
11897 +       /*
11898 +        * Remove function-return probe instances associated with this
11899 +        * task and put them back on the free list.
11900 +        */
11901 +       kprobe_flush_task(tsk);
11902 +
11903 +       /* Task is done with its stack. */
11904 +       put_task_stack(tsk);
11905 +
11906         cgroup_free(tsk);
11907         task_numa_free(tsk);
11908         security_task_free(tsk);
11909 @@ -393,7 +405,18 @@ void __put_task_struct(struct task_struct *tsk)
11910         if (!profile_handoff_task(tsk))
11911                 free_task(tsk);
11912  }
11913 +#ifndef CONFIG_PREEMPT_RT_BASE
11914  EXPORT_SYMBOL_GPL(__put_task_struct);
11915 +#else
11916 +void __put_task_struct_cb(struct rcu_head *rhp)
11917 +{
11918 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
11919 +
11920 +       __put_task_struct(tsk);
11921 +
11922 +}
11923 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
11924 +#endif
11925  
11926  void __init __weak arch_task_cache_init(void) { }
11927  
11928 @@ -852,6 +875,19 @@ void __mmdrop(struct mm_struct *mm)
11929  }
11930  EXPORT_SYMBOL_GPL(__mmdrop);
11931  
11932 +#ifdef CONFIG_PREEMPT_RT_BASE
11933 +/*
11934 + * RCU callback for delayed mm drop. Not strictly rcu, but we don't
11935 + * want another facility to make this work.
11936 + */
11937 +void __mmdrop_delayed(struct rcu_head *rhp)
11938 +{
11939 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
11940 +
11941 +       __mmdrop(mm);
11942 +}
11943 +#endif
11944 +
11945  static inline void __mmput(struct mm_struct *mm)
11946  {
11947         VM_BUG_ON(atomic_read(&mm->mm_users));
11948 @@ -1417,6 +1453,7 @@ static void rt_mutex_init_task(struct task_struct *p)
11949  #ifdef CONFIG_RT_MUTEXES
11950         p->pi_waiters = RB_ROOT;
11951         p->pi_waiters_leftmost = NULL;
11952 +       p->pi_top_task = NULL;
11953         p->pi_blocked_on = NULL;
11954  #endif
11955  }
11956 @@ -1426,6 +1463,9 @@ static void rt_mutex_init_task(struct task_struct *p)
11957   */
11958  static void posix_cpu_timers_init(struct task_struct *tsk)
11959  {
11960 +#ifdef CONFIG_PREEMPT_RT_BASE
11961 +       tsk->posix_timer_list = NULL;
11962 +#endif
11963         tsk->cputime_expires.prof_exp = 0;
11964         tsk->cputime_expires.virt_exp = 0;
11965         tsk->cputime_expires.sched_exp = 0;
11966 @@ -1552,6 +1592,7 @@ static __latent_entropy struct task_struct *copy_process(
11967         spin_lock_init(&p->alloc_lock);
11968  
11969         init_sigpending(&p->pending);
11970 +       p->sigqueue_cache = NULL;
11971  
11972         p->utime = p->stime = p->gtime = 0;
11973         p->utimescaled = p->stimescaled = 0;
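
On RT the fork.c hunk above makes __put_task_struct() static and exports
__put_task_struct_cb(), an RCU callback built around the task's put_rcu head. The header
side of the change is not part of this hunk; presumably it routes the final reference
drop through call_rcu(), roughly like this hedged sketch (demo_put_task_struct is
illustrative):

        #include <linux/rcupdate.h>
        #include <linux/sched.h>

        static inline void demo_put_task_struct(struct task_struct *t)
        {
                if (atomic_dec_and_test(&t->usage))
        #ifdef CONFIG_PREEMPT_RT_BASE
                        /* defer teardown to preemptible context via RCU */
                        call_rcu(&t->put_rcu, __put_task_struct_cb);
        #else
                        __put_task_struct(t);
        #endif
        }
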
11974 diff --git a/kernel/futex.c b/kernel/futex.c
11975 index 4c6b6e697b73..d9bab63efccb 100644
11976 --- a/kernel/futex.c
11977 +++ b/kernel/futex.c
11978 @@ -800,7 +800,7 @@ static int refill_pi_state_cache(void)
11979         return 0;
11980  }
11981  
11982 -static struct futex_pi_state * alloc_pi_state(void)
11983 +static struct futex_pi_state *alloc_pi_state(void)
11984  {
11985         struct futex_pi_state *pi_state = current->pi_state_cache;
11986  
11987 @@ -810,6 +810,11 @@ static struct futex_pi_state * alloc_pi_state(void)
11988         return pi_state;
11989  }
11990  
11991 +static void get_pi_state(struct futex_pi_state *pi_state)
11992 +{
11993 +       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
11994 +}
11995 +
11996  /*
11997   * Drops a reference to the pi_state object and frees or caches it
11998   * when the last reference is gone.
11999 @@ -854,7 +859,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
12000   * Look up the task based on what TID userspace gave us.
12001   * We dont trust it.
12002   */
12003 -static struct task_struct * futex_find_get_task(pid_t pid)
12004 +static struct task_struct *futex_find_get_task(pid_t pid)
12005  {
12006         struct task_struct *p;
12007  
12008 @@ -904,7 +909,9 @@ void exit_pi_state_list(struct task_struct *curr)
12009                  * task still owns the PI-state:
12010                  */
12011                 if (head->next != next) {
12012 +                       raw_spin_unlock_irq(&curr->pi_lock);
12013                         spin_unlock(&hb->lock);
12014 +                       raw_spin_lock_irq(&curr->pi_lock);
12015                         continue;
12016                 }
12017  
12018 @@ -914,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
12019                 pi_state->owner = NULL;
12020                 raw_spin_unlock_irq(&curr->pi_lock);
12021  
12022 -               rt_mutex_unlock(&pi_state->pi_mutex);
12023 -
12024 +               get_pi_state(pi_state);
12025                 spin_unlock(&hb->lock);
12026  
12027 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12028 +               put_pi_state(pi_state);
12029 +
12030                 raw_spin_lock_irq(&curr->pi_lock);
12031         }
12032         raw_spin_unlock_irq(&curr->pi_lock);
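
The exit_pi_state_list() change above is the pattern this futex rework uses throughout:
take an explicit reference with get_pi_state() before dropping hb->lock, do the rtmutex
unlock without the hash-bucket lock held, then drop the reference with put_pi_state().
Generic sketch of pinning a refcounted object across an unlock (names are illustrative):

        #include <linux/slab.h>
        #include <linux/spinlock.h>

        struct demo_obj {
                atomic_t refcount;
        };

        /* Caller holds 'lock'; returns with 'lock' dropped. */
        static void demo_use_unlocked(struct demo_obj *obj, spinlock_t *lock)
        {
                atomic_inc(&obj->refcount);     /* object can't vanish once we unlock */
                spin_unlock(lock);

                /* ... operate on obj, possibly sleeping ... */

                if (atomic_dec_and_test(&obj->refcount))
                        kfree(obj);
        }
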
12033 @@ -971,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
12034   *
12035   * [10] There is no transient state which leaves owner and user space
12036   *     TID out of sync.
12037 + *
12038 + *
12039 + * Serialization and lifetime rules:
12040 + *
12041 + * hb->lock:
12042 + *
12043 + *     hb -> futex_q, relation
12044 + *     futex_q -> pi_state, relation
12045 + *
12046 + *     (cannot be raw because hb can contain an arbitrary number
12047 + *      of futex_q's)
12048 + *
12049 + * pi_mutex->wait_lock:
12050 + *
12051 + *     {uval, pi_state}
12052 + *
12053 + *     (and pi_mutex 'obviously')
12054 + *
12055 + * p->pi_lock:
12056 + *
12057 + *     p->pi_state_list -> pi_state->list, relation
12058 + *
12059 + * pi_state->refcount:
12060 + *
12061 + *     pi_state lifetime
12062 + *
12063 + *
12064 + * Lock order:
12065 + *
12066 + *   hb->lock
12067 + *     pi_mutex->wait_lock
12068 + *       p->pi_lock
12069 + *
12070   */
12071  
12072  /*
12073 @@ -978,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
12074   * the pi_state against the user space value. If correct, attach to
12075   * it.
12076   */
12077 -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12078 +static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
12079 +                             struct futex_pi_state *pi_state,
12080                               struct futex_pi_state **ps)
12081  {
12082         pid_t pid = uval & FUTEX_TID_MASK;
12083 +       u32 uval2;
12084 +       int ret;
12085  
12086         /*
12087          * Userspace might have messed up non-PI and PI futexes [3]
12088 @@ -989,9 +1034,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12089         if (unlikely(!pi_state))
12090                 return -EINVAL;
12091  
12092 +       /*
12093 +        * We get here with hb->lock held, and having found a
12094 +        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
12095 +        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
12096 +        * which in turn means that futex_lock_pi() still has a reference on
12097 +        * our pi_state.
12098 +        *
12099 +        * The waiter holding a reference on @pi_state also protects against
12100 +        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
12101 +        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
12102 +        * free pi_state before we can take a reference ourselves.
12103 +        */
12104         WARN_ON(!atomic_read(&pi_state->refcount));
12105  
12106         /*
12107 +        * Now that we have a pi_state, we can acquire wait_lock
12108 +        * and do the state validation.
12109 +        */
12110 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12111 +
12112 +       /*
12113 +        * Since {uval, pi_state} is serialized by wait_lock, and our current
12114 +        * uval was read without holding it, it can have changed. Verify it
12115 +        * still is what we expect it to be, otherwise retry the entire
12116 +        * operation.
12117 +        */
12118 +       if (get_futex_value_locked(&uval2, uaddr))
12119 +               goto out_efault;
12120 +
12121 +       if (uval != uval2)
12122 +               goto out_eagain;
12123 +
12124 +       /*
12125          * Handle the owner died case:
12126          */
12127         if (uval & FUTEX_OWNER_DIED) {
12128 @@ -1006,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12129                          * is not 0. Inconsistent state. [5]
12130                          */
12131                         if (pid)
12132 -                               return -EINVAL;
12133 +                               goto out_einval;
12134                         /*
12135                          * Take a ref on the state and return success. [4]
12136                          */
12137 -                       goto out_state;
12138 +                       goto out_attach;
12139                 }
12140  
12141                 /*
12142 @@ -1022,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12143                  * Take a ref on the state and return success. [6]
12144                  */
12145                 if (!pid)
12146 -                       goto out_state;
12147 +                       goto out_attach;
12148         } else {
12149                 /*
12150                  * If the owner died bit is not set, then the pi_state
12151                  * must have an owner. [7]
12152                  */
12153                 if (!pi_state->owner)
12154 -                       return -EINVAL;
12155 +                       goto out_einval;
12156         }
12157  
12158         /*
12159 @@ -1038,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
12160          * user space TID. [9/10]
12161          */
12162         if (pid != task_pid_vnr(pi_state->owner))
12163 -               return -EINVAL;
12164 -out_state:
12165 -       atomic_inc(&pi_state->refcount);
12166 +               goto out_einval;
12167 +
12168 +out_attach:
12169 +       get_pi_state(pi_state);
12170 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12171         *ps = pi_state;
12172         return 0;
12173 +
12174 +out_einval:
12175 +       ret = -EINVAL;
12176 +       goto out_error;
12177 +
12178 +out_eagain:
12179 +       ret = -EAGAIN;
12180 +       goto out_error;
12181 +
12182 +out_efault:
12183 +       ret = -EFAULT;
12184 +       goto out_error;
12185 +
12186 +out_error:
12187 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12188 +       return ret;
12189  }
12190  
12191  /*
12192 @@ -1093,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
12193  
12194         /*
12195          * No existing pi state. First waiter. [2]
12196 +        *
12197 +        * This creates pi_state, we have hb->lock held, this means nothing can
12198 +        * observe this state, wait_lock is irrelevant.
12199          */
12200         pi_state = alloc_pi_state();
12201  
12202 @@ -1117,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
12203         return 0;
12204  }
12205  
12206 -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
12207 +static int lookup_pi_state(u32 __user *uaddr, u32 uval,
12208 +                          struct futex_hash_bucket *hb,
12209                            union futex_key *key, struct futex_pi_state **ps)
12210  {
12211 -       struct futex_q *match = futex_top_waiter(hb, key);
12212 +       struct futex_q *top_waiter = futex_top_waiter(hb, key);
12213  
12214         /*
12215          * If there is a waiter on that futex, validate it and
12216          * attach to the pi_state when the validation succeeds.
12217          */
12218 -       if (match)
12219 -               return attach_to_pi_state(uval, match->pi_state, ps);
12220 +       if (top_waiter)
12221 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12222  
12223         /*
12224          * We are the first waiter - try to look up the owner based on
12225 @@ -1146,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
12226         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
12227                 return -EFAULT;
12228  
12229 -       /*If user space value changed, let the caller retry */
12230 +       /* If user space value changed, let the caller retry */
12231         return curval != uval ? -EAGAIN : 0;
12232  }
12233  
12234 @@ -1174,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12235                                 struct task_struct *task, int set_waiters)
12236  {
12237         u32 uval, newval, vpid = task_pid_vnr(task);
12238 -       struct futex_q *match;
12239 +       struct futex_q *top_waiter;
12240         int ret;
12241  
12242         /*
12243 @@ -1200,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
12244          * Lookup existing state first. If it exists, try to attach to
12245          * its pi_state.
12246          */
12247 -       match = futex_top_waiter(hb, key);
12248 -       if (match)
12249 -               return attach_to_pi_state(uval, match->pi_state, ps);
12250 +       top_waiter = futex_top_waiter(hb, key);
12251 +       if (top_waiter)
12252 +               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
12253  
12254         /*
12255          * No waiter and user TID is 0. We are here because the
12256 @@ -1283,50 +1380,45 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
12257         wake_q_add(wake_q, p);
12258         __unqueue_futex(q);
12259         /*
12260 -        * The waiting task can free the futex_q as soon as
12261 -        * q->lock_ptr = NULL is written, without taking any locks. A
12262 -        * memory barrier is required here to prevent the following
12263 -        * store to lock_ptr from getting ahead of the plist_del.
12264 +        * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
12265 +        * is written, without taking any locks. This is possible in the event
12266 +        * of a spurious wakeup, for example. A memory barrier is required here
12267 +        * to prevent the following store to lock_ptr from getting ahead of the
12268 +        * plist_del in __unqueue_futex().
12269          */
12270 -       smp_wmb();
12271 -       q->lock_ptr = NULL;
12272 +       smp_store_release(&q->lock_ptr, NULL);
12273  }
12274  
12275 -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12276 -                        struct futex_hash_bucket *hb)
12277 +/*
12278 + * Caller must hold a reference on @pi_state.
12279 + */
12280 +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
12281  {
12282 -       struct task_struct *new_owner;
12283 -       struct futex_pi_state *pi_state = this->pi_state;
12284         u32 uninitialized_var(curval), newval;
12285 +       struct task_struct *new_owner;
12286 +       bool postunlock = false;
12287         WAKE_Q(wake_q);
12288 -       bool deboost;
12289 +       WAKE_Q(wake_sleeper_q);
12290         int ret = 0;
12291  
12292 -       if (!pi_state)
12293 -               return -EINVAL;
12294 -
12295 -       /*
12296 -        * If current does not own the pi_state then the futex is
12297 -        * inconsistent and user space fiddled with the futex value.
12298 -        */
12299 -       if (pi_state->owner != current)
12300 -               return -EINVAL;
12301 -
12302 -       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12303         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
12304 +       if (WARN_ON_ONCE(!new_owner)) {
12305 +               /*
12306 +                * As per the comment in futex_unlock_pi() this should not happen.
12307 +                *
12308 +                * When this happens, give up our locks and try again, giving
12309 +                * the futex_lock_pi() instance time to complete, either by
12310 +                * waiting on the rtmutex or removing itself from the futex
12311 +                * queue.
12312 +                */
12313 +               ret = -EAGAIN;
12314 +               goto out_unlock;
12315 +       }
12316  
12317         /*
12318 -        * It is possible that the next waiter (the one that brought
12319 -        * this owner to the kernel) timed out and is no longer
12320 -        * waiting on the lock.
12321 -        */
12322 -       if (!new_owner)
12323 -               new_owner = this->task;
12324 -
12325 -       /*
12326 -        * We pass it to the next owner. The WAITERS bit is always
12327 -        * kept enabled while there is PI state around. We cleanup the
12328 -        * owner died bit, because we are the owner.
12329 +        * We pass it to the next owner. The WAITERS bit is always kept
12330 +        * enabled while there is PI state around. We cleanup the owner
12331 +        * died bit, because we are the owner.
12332          */
12333         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
12334  
12335 @@ -1335,6 +1427,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12336  
12337         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
12338                 ret = -EFAULT;
12339 +
12340         } else if (curval != uval) {
12341                 /*
12342                  * If an unconditional UNLOCK_PI operation (user space did not
12343 @@ -1347,10 +1440,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12344                 else
12345                         ret = -EINVAL;
12346         }
12347 -       if (ret) {
12348 -               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12349 -               return ret;
12350 -       }
12351 +
12352 +       if (ret)
12353 +               goto out_unlock;
12354 +
12355 +       /*
12356 +        * This is a point of no return; once we modify the uval there is no
12357 +        * going back and subsequent operations must not fail.
12358 +        */
12359  
12360         raw_spin_lock(&pi_state->owner->pi_lock);
12361         WARN_ON(list_empty(&pi_state->list));
12362 @@ -1363,22 +1460,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12363         pi_state->owner = new_owner;
12364         raw_spin_unlock(&new_owner->pi_lock);
12365  
12366 +       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
12367 +                                            &wake_sleeper_q);
12368 +out_unlock:
12369         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12370  
12371 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
12372 +       if (postunlock)
12373 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
12374  
12375 -       /*
12376 -        * First unlock HB so the waiter does not spin on it once he got woken
12377 -        * up. Second wake up the waiter before the priority is adjusted. If we
12378 -        * deboost first (and lose our higher priority), then the task might get
12379 -        * scheduled away before the wake up can take place.
12380 -        */
12381 -       spin_unlock(&hb->lock);
12382 -       wake_up_q(&wake_q);
12383 -       if (deboost)
12384 -               rt_mutex_adjust_prio(current);
12385 -
12386 -       return 0;
12387 +       return ret;
12388  }
12389  
12390  /*
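
The rewritten wake_futex_pi() above queues every wakeup on wake queues while wait_lock is
still held and only issues them, via rt_mutex_postunlock(), after all locks are dropped,
so a freshly woken waiter cannot immediately block on a lock the waker still owns. The
wake-queue API itself is the stock one; a minimal usage sketch (demo_* is illustrative):

        #include <linux/sched.h>

        static void demo_deferred_wakeups(struct task_struct *a, struct task_struct *b)
        {
                WAKE_Q(wake_q);                 /* on-stack wake queue */

                /* queue tasks while locks are still held */
                wake_q_add(&wake_q, a);
                wake_q_add(&wake_q, b);

                /* ... drop the locks ... */

                wake_up_q(&wake_q);             /* issue the wakeups lock-free */
        }
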
12391 @@ -1824,7 +1914,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12392                          * If that call succeeds then we have pi_state and an
12393                          * initial refcount on it.
12394                          */
12395 -                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
12396 +                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
12397                 }
12398  
12399                 switch (ret) {
12400 @@ -1907,7 +1997,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12401                          * refcount on the pi_state and store the pointer in
12402                          * the futex_q object of the waiter.
12403                          */
12404 -                       atomic_inc(&pi_state->refcount);
12405 +                       get_pi_state(pi_state);
12406                         this->pi_state = pi_state;
12407                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
12408                                                         this->rt_waiter,
12409 @@ -1924,6 +2014,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12410                                 requeue_pi_wake_futex(this, &key2, hb2);
12411                                 drop_count++;
12412                                 continue;
12413 +                       } else if (ret == -EAGAIN) {
12414 +                               /*
12415 +                                * Waiter was woken by timeout or
12416 +                                * signal and has set pi_blocked_on to
12417 +                                * PI_WAKEUP_INPROGRESS before we
12418 +                                * tried to enqueue it on the rtmutex.
12419 +                                */
12420 +                               this->pi_state = NULL;
12421 +                               put_pi_state(pi_state);
12422 +                               continue;
12423                         } else if (ret) {
12424                                 /*
12425                                  * rt_mutex_start_proxy_lock() detected a
12426 @@ -2007,20 +2107,7 @@ queue_unlock(struct futex_hash_bucket *hb)
12427         hb_waiters_dec(hb);
12428  }
12429  
12430 -/**
12431 - * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12432 - * @q: The futex_q to enqueue
12433 - * @hb:        The destination hash bucket
12434 - *
12435 - * The hb->lock must be held by the caller, and is released here. A call to
12436 - * queue_me() is typically paired with exactly one call to unqueue_me().  The
12437 - * exceptions involve the PI related operations, which may use unqueue_me_pi()
12438 - * or nothing if the unqueue is done as part of the wake process and the unqueue
12439 - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12440 - * an example).
12441 - */
12442 -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12443 -       __releases(&hb->lock)
12444 +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12445  {
12446         int prio;
12447  
12448 @@ -2037,6 +2124,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12449         plist_node_init(&q->list, prio);
12450         plist_add(&q->list, &hb->chain);
12451         q->task = current;
12452 +}
12453 +
12454 +/**
12455 + * queue_me() - Enqueue the futex_q on the futex_hash_bucket
12456 + * @q: The futex_q to enqueue
12457 + * @hb:        The destination hash bucket
12458 + *
12459 + * The hb->lock must be held by the caller, and is released here. A call to
12460 + * queue_me() is typically paired with exactly one call to unqueue_me().  The
12461 + * exceptions involve the PI related operations, which may use unqueue_me_pi()
12462 + * or nothing if the unqueue is done as part of the wake process and the unqueue
12463 + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
12464 + * an example).
12465 + */
12466 +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
12467 +       __releases(&hb->lock)
12468 +{
12469 +       __queue_me(q, hb);
12470         spin_unlock(&hb->lock);
12471  }
12472  
12473 @@ -2123,10 +2228,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12474  {
12475         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
12476         struct futex_pi_state *pi_state = q->pi_state;
12477 -       struct task_struct *oldowner = pi_state->owner;
12478         u32 uval, uninitialized_var(curval), newval;
12479 +       struct task_struct *oldowner;
12480         int ret;
12481  
12482 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12483 +
12484 +       oldowner = pi_state->owner;
12485         /* Owner died? */
12486         if (!pi_state->owner)
12487                 newtid |= FUTEX_OWNER_DIED;
12488 @@ -2134,7 +2242,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12489         /*
12490          * We are here either because we stole the rtmutex from the
12491          * previous highest priority waiter or we are the highest priority
12492 -        * waiter but failed to get the rtmutex the first time.
12493 +        * waiter but have failed to get the rtmutex the first time.
12494 +        *
12495          * We have to replace the newowner TID in the user space variable.
12496          * This must be atomic as we have to preserve the owner died bit here.
12497          *
12498 @@ -2142,17 +2251,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12499          * because we can fault here. Imagine swapped out pages or a fork
12500          * that marked all the anonymous memory readonly for cow.
12501          *
12502 -        * Modifying pi_state _before_ the user space value would
12503 -        * leave the pi_state in an inconsistent state when we fault
12504 -        * here, because we need to drop the hash bucket lock to
12505 -        * handle the fault. This might be observed in the PID check
12506 -        * in lookup_pi_state.
12507 +        * Modifying pi_state _before_ the user space value would leave the
12508 +        * pi_state in an inconsistent state when we fault here, because we
12509 +        * need to drop the locks to handle the fault. This might be observed
12510 +        * in the PID check in lookup_pi_state.
12511          */
12512  retry:
12513         if (get_futex_value_locked(&uval, uaddr))
12514                 goto handle_fault;
12515  
12516 -       while (1) {
12517 +       for (;;) {
12518                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
12519  
12520                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
12521 @@ -2167,47 +2275,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
12522          * itself.
12523          */
12524         if (pi_state->owner != NULL) {
12525 -               raw_spin_lock_irq(&pi_state->owner->pi_lock);
12526 +               raw_spin_lock(&pi_state->owner->pi_lock);
12527                 WARN_ON(list_empty(&pi_state->list));
12528                 list_del_init(&pi_state->list);
12529 -               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
12530 +               raw_spin_unlock(&pi_state->owner->pi_lock);
12531         }
12532  
12533         pi_state->owner = newowner;
12534  
12535 -       raw_spin_lock_irq(&newowner->pi_lock);
12536 +       raw_spin_lock(&newowner->pi_lock);
12537         WARN_ON(!list_empty(&pi_state->list));
12538         list_add(&pi_state->list, &newowner->pi_state_list);
12539 -       raw_spin_unlock_irq(&newowner->pi_lock);
12540 +       raw_spin_unlock(&newowner->pi_lock);
12541 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12542 +
12543         return 0;
12544  
12545         /*
12546 -        * To handle the page fault we need to drop the hash bucket
12547 -        * lock here. That gives the other task (either the highest priority
12548 -        * waiter itself or the task which stole the rtmutex) the
12549 -        * chance to try the fixup of the pi_state. So once we are
12550 -        * back from handling the fault we need to check the pi_state
12551 -        * after reacquiring the hash bucket lock and before trying to
12552 -        * do another fixup. When the fixup has been done already we
12553 -        * simply return.
12554 +        * To handle the page fault we need to drop the locks here. That gives
12555 +        * the other task (either the highest priority waiter itself or the
12556 +        * task which stole the rtmutex) the chance to try the fixup of the
12557 +        * pi_state. So once we are back from handling the fault we need to
12558 +        * check the pi_state after reacquiring the locks and before trying to
12559 +        * do another fixup. When the fixup has been done already we simply
12560 +        * return.
12561 +        *
12562 +        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
12563 +        * drop hb->lock since the caller owns the hb -> futex_q relation.
12564 +        * Dropping the pi_mutex->wait_lock requires the state to be revalidated.
12565          */
12566  handle_fault:
12567 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12568         spin_unlock(q->lock_ptr);
12569  
12570         ret = fault_in_user_writeable(uaddr);
12571  
12572         spin_lock(q->lock_ptr);
12573 +       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12574  
12575         /*
12576          * Check if someone else fixed it for us:
12577          */
12578 -       if (pi_state->owner != oldowner)
12579 -               return 0;
12580 +       if (pi_state->owner != oldowner) {
12581 +               ret = 0;
12582 +               goto out_unlock;
12583 +       }
12584  
12585         if (ret)
12586 -               return ret;
12587 +               goto out_unlock;
12588  
12589         goto retry;
12590 +
12591 +out_unlock:
12592 +       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12593 +       return ret;
12594  }
12595  
12596  static long futex_wait_restart(struct restart_block *restart);
12597 @@ -2229,13 +2350,16 @@ static long futex_wait_restart(struct restart_block *restart);
12598   */
12599  static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12600  {
12601 -       struct task_struct *owner;
12602         int ret = 0;
12603  
12604         if (locked) {
12605                 /*
12606                  * Got the lock. We might not be the anticipated owner if we
12607                  * did a lock-steal - fix up the PI-state in that case:
12608 +                *
12609 +                * We can safely read pi_state->owner without holding wait_lock
12610 +                * because we now own the rt_mutex, only the owner will attempt
12611 +                * to change it.
12612                  */
12613                 if (q->pi_state->owner != current)
12614                         ret = fixup_pi_state_owner(uaddr, q, current);
12615 @@ -2243,43 +2367,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
12616         }
12617  
12618         /*
12619 -        * Catch the rare case, where the lock was released when we were on the
12620 -        * way back before we locked the hash bucket.
12621 -        */
12622 -       if (q->pi_state->owner == current) {
12623 -               /*
12624 -                * Try to get the rt_mutex now. This might fail as some other
12625 -                * task acquired the rt_mutex after we removed ourself from the
12626 -                * rt_mutex waiters list.
12627 -                */
12628 -               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
12629 -                       locked = 1;
12630 -                       goto out;
12631 -               }
12632 -
12633 -               /*
12634 -                * pi_state is incorrect, some other task did a lock steal and
12635 -                * we returned due to timeout or signal without taking the
12636 -                * rt_mutex. Too late.
12637 -                */
12638 -               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
12639 -               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
12640 -               if (!owner)
12641 -                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
12642 -               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
12643 -               ret = fixup_pi_state_owner(uaddr, q, owner);
12644 -               goto out;
12645 -       }
12646 -
12647 -       /*
12648          * Paranoia check. If we did not take the lock, then we should not be
12649          * the owner of the rt_mutex.
12650          */
12651 -       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
12652 +       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
12653                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
12654                                 "pi-state %p\n", ret,
12655                                 q->pi_state->pi_mutex.owner,
12656                                 q->pi_state->owner);
12657 +       }
12658  
12659  out:
12660         return ret ? ret : locked;
12661 @@ -2503,6 +2599,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12662                          ktime_t *time, int trylock)
12663  {
12664         struct hrtimer_sleeper timeout, *to = NULL;
12665 +       struct futex_pi_state *pi_state = NULL;
12666 +       struct rt_mutex_waiter rt_waiter;
12667         struct futex_hash_bucket *hb;
12668         struct futex_q q = futex_q_init;
12669         int res, ret;
12670 @@ -2555,25 +2653,77 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12671                 }
12672         }
12673  
12674 +       WARN_ON(!q.pi_state);
12675 +
12676         /*
12677          * Only actually queue now that the atomic ops are done:
12678          */
12679 -       queue_me(&q, hb);
12680 +       __queue_me(&q, hb);
12681  
12682 -       WARN_ON(!q.pi_state);
12683 -       /*
12684 -        * Block on the PI mutex:
12685 -        */
12686 -       if (!trylock) {
12687 -               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
12688 -       } else {
12689 -               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
12690 +       if (trylock) {
12691 +               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
12692                 /* Fixup the trylock return value: */
12693                 ret = ret ? 0 : -EWOULDBLOCK;
12694 +               goto no_block;
12695         }
12696  
12697 +       rt_mutex_init_waiter(&rt_waiter, false);
12698 +
12699 +       /*
12700 +        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
12701 +        * hold it while doing rt_mutex_start_proxy(), because then it will
12702 +        * include hb->lock in the blocking chain, even though we'll not in
12703 +        * fact hold it while blocking. This will lead it to report -EDEADLK
12704 +        * and BUG when futex_unlock_pi() interleaves with this.
12705 +        *
12706 +        * Therefore acquire wait_lock while holding hb->lock, but drop the
12707 +        * latter before calling rt_mutex_start_proxy_lock(). This still fully
12708 +        * serializes against futex_unlock_pi() as that does the exact same
12709 +        * lock handoff sequence.
12710 +        */
12711 +       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
12712 +       /*
12713 +        * The migrate_disable() here disables migration in the in_atomic() fast
12714 +        * path; migration is enabled again in the following spin_unlock(). We
12715 +        * have one migrate_disable() pending in the slow path, which is reversed
12716 +        * after the raw_spin_unlock_irq() where we leave the atomic context.
12717 +        */
12718 +       migrate_disable();
12719 +
12720 +       spin_unlock(q.lock_ptr);
12721 +       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
12722 +       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
12723 +       migrate_enable();
12724 +
12725 +       if (ret) {
12726 +               if (ret == 1)
12727 +                       ret = 0;
12728 +
12729 +               spin_lock(q.lock_ptr);
12730 +               goto no_block;
12731 +       }
12732 +
12733 +
12734 +       if (unlikely(to))
12735 +               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
12736 +
12737 +       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
12738 +
12739         spin_lock(q.lock_ptr);
12740         /*
12741 +        * If we failed to acquire the lock (signal/timeout), we must
12742 +        * first acquire the hb->lock before removing the lock from the
12743 +        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
12744 +        * wait lists consistent.
12745 +        *
12746 +        * In particular, it is important that futex_unlock_pi() cannot
12747 +        * observe this inconsistency.
12748 +        */
12749 +       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
12750 +               ret = 0;
12751 +
12752 +no_block:
12753 +       /*
12754          * Fixup the pi_state owner and possibly acquire the lock if we
12755          * haven't already.
12756          */
12757 @@ -2589,12 +2739,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12758          * If fixup_owner() faulted and was unable to handle the fault, unlock
12759          * it and return the fault to userspace.
12760          */
12761 -       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
12762 -               rt_mutex_unlock(&q.pi_state->pi_mutex);
12763 +       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
12764 +               pi_state = q.pi_state;
12765 +               get_pi_state(pi_state);
12766 +       }
12767  
12768         /* Unqueue and drop the lock */
12769         unqueue_me_pi(&q);
12770  
12771 +       if (pi_state) {
12772 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
12773 +               put_pi_state(pi_state);
12774 +       }
12775 +
12776         goto out_put_key;
12777  
12778  out_unlock_put_key:
12779 @@ -2603,8 +2760,10 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
12780  out_put_key:
12781         put_futex_key(&q.key);
12782  out:
12783 -       if (to)
12784 +       if (to) {
12785 +               hrtimer_cancel(&to->timer);
12786                 destroy_hrtimer_on_stack(&to->timer);
12787 +       }
12788         return ret != -EINTR ? ret : -ERESTARTNOINTR;
12789  
12790  uaddr_faulted:
12791 @@ -2631,7 +2790,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12792         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
12793         union futex_key key = FUTEX_KEY_INIT;
12794         struct futex_hash_bucket *hb;
12795 -       struct futex_q *match;
12796 +       struct futex_q *top_waiter;
12797         int ret;
12798  
12799  retry:
12800 @@ -2655,12 +2814,48 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12801          * all and we at least want to know if user space fiddled
12802          * with the futex value instead of blindly unlocking.
12803          */
12804 -       match = futex_top_waiter(hb, &key);
12805 -       if (match) {
12806 -               ret = wake_futex_pi(uaddr, uval, match, hb);
12807 +       top_waiter = futex_top_waiter(hb, &key);
12808 +       if (top_waiter) {
12809 +               struct futex_pi_state *pi_state = top_waiter->pi_state;
12810 +
12811 +               ret = -EINVAL;
12812 +               if (!pi_state)
12813 +                       goto out_unlock;
12814 +
12815                 /*
12816 -                * In case of success wake_futex_pi dropped the hash
12817 -                * bucket lock.
12818 +                * If current does not own the pi_state then the futex is
12819 +                * inconsistent and user space fiddled with the futex value.
12820 +                */
12821 +               if (pi_state->owner != current)
12822 +                       goto out_unlock;
12823 +
12824 +               get_pi_state(pi_state);
12825 +               /*
12826 +                * By taking wait_lock while still holding hb->lock, we ensure
12827 +                * there is no point where we hold neither; and therefore
12828 +                * wake_futex_pi() must observe a state consistent with what we
12829 +                * observed.
12830 +                */
12831 +               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
12832 +               /*
12833 +                * Magic trickery for now to make the RT migrate disable
12834 +                * logic happy. The following spin_unlock() happens with
12835 +                * interrupts disabled so the internal migrate_enable()
12836 +                * won't undo the migrate_disable() which was issued when
12837 +                * locking hb->lock.
12838 +                */
12839 +               migrate_disable();
12840 +               spin_unlock(&hb->lock);
12841 +
12842 +               /* Drops pi_state->pi_mutex.wait_lock */
12843 +               ret = wake_futex_pi(uaddr, uval, pi_state);
12844 +
12845 +               migrate_enable();
12846 +
12847 +               put_pi_state(pi_state);
12848 +
12849 +               /*
12850 +                * Success, we're done! No tricky corner cases.
12851                  */
12852                 if (!ret)
12853                         goto out_putkey;
12854 @@ -2675,7 +2870,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12855                  * setting the FUTEX_WAITERS bit. Try again.
12856                  */
12857                 if (ret == -EAGAIN) {
12858 -                       spin_unlock(&hb->lock);
12859                         put_futex_key(&key);
12860                         goto retry;
12861                 }
12862 @@ -2683,7 +2877,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12863                  * wake_futex_pi has detected invalid state. Tell user
12864                  * space.
12865                  */
12866 -               goto out_unlock;
12867 +               goto out_putkey;
12868         }
12869  
12870         /*
12871 @@ -2693,8 +2887,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12872          * preserve the WAITERS bit not the OWNER_DIED one. We are the
12873          * owner.
12874          */
12875 -       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
12876 +       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
12877 +               spin_unlock(&hb->lock);
12878                 goto pi_faulted;
12879 +       }
12880  
12881         /*
12882          * If uval has changed, let user space handle it.
12883 @@ -2708,7 +2904,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
12884         return ret;
12885  
12886  pi_faulted:
12887 -       spin_unlock(&hb->lock);
12888         put_futex_key(&key);
12889  
12890         ret = fault_in_user_writeable(uaddr);
12891 @@ -2812,8 +3007,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12892                                  u32 __user *uaddr2)
12893  {
12894         struct hrtimer_sleeper timeout, *to = NULL;
12895 +       struct futex_pi_state *pi_state = NULL;
12896         struct rt_mutex_waiter rt_waiter;
12897 -       struct futex_hash_bucket *hb;
12898 +       struct futex_hash_bucket *hb, *hb2;
12899         union futex_key key2 = FUTEX_KEY_INIT;
12900         struct futex_q q = futex_q_init;
12901         int res, ret;
12902 @@ -2838,10 +3034,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12903          * The waiter is allocated on our stack, manipulated by the requeue
12904          * code while we sleep on uaddr.
12905          */
12906 -       debug_rt_mutex_init_waiter(&rt_waiter);
12907 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
12908 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
12909 -       rt_waiter.task = NULL;
12910 +       rt_mutex_init_waiter(&rt_waiter, false);
12911  
12912         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
12913         if (unlikely(ret != 0))
12914 @@ -2872,20 +3065,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12915         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
12916         futex_wait_queue_me(hb, &q, to);
12917  
12918 -       spin_lock(&hb->lock);
12919 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12920 -       spin_unlock(&hb->lock);
12921 -       if (ret)
12922 -               goto out_put_keys;
12923 +       /*
12924 +        * On RT we must avoid races with requeue and trying to block
12925 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
12926 +        * serializing access to pi_blocked_on with pi_lock.
12927 +        */
12928 +       raw_spin_lock_irq(&current->pi_lock);
12929 +       if (current->pi_blocked_on) {
12930 +               /*
12931 +                * We have been requeued or are in the process of
12932 +                * being requeued.
12933 +                */
12934 +               raw_spin_unlock_irq(&current->pi_lock);
12935 +       } else {
12936 +               /*
12937 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
12938 +                * prevents a concurrent requeue from moving us to the
12939 +                * uaddr2 rtmutex. After that we can safely acquire
12940 +                * (and possibly block on) hb->lock.
12941 +                */
12942 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
12943 +               raw_spin_unlock_irq(&current->pi_lock);
12944 +
12945 +               spin_lock(&hb->lock);
12946 +
12947 +               /*
12948 +                * Clean up pi_blocked_on. We might leak it otherwise
12949 +                * when we succeeded with the hb->lock in the fast
12950 +                * path.
12951 +                */
12952 +               raw_spin_lock_irq(&current->pi_lock);
12953 +               current->pi_blocked_on = NULL;
12954 +               raw_spin_unlock_irq(&current->pi_lock);
12955 +
12956 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12957 +               spin_unlock(&hb->lock);
12958 +               if (ret)
12959 +                       goto out_put_keys;
12960 +       }
12961  
12962         /*
12963 -        * In order for us to be here, we know our q.key == key2, and since
12964 -        * we took the hb->lock above, we also know that futex_requeue() has
12965 -        * completed and we no longer have to concern ourselves with a wakeup
12966 -        * race with the atomic proxy lock acquisition by the requeue code. The
12967 -        * futex_requeue dropped our key1 reference and incremented our key2
12968 -        * reference count.
12969 +        * In order to be here, we have either been requeued, are in
12970 +        * the process of being requeued, or requeue successfully
12971 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
12972 +        * non-null above, we may be racing with a requeue.  Do not
12973 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
12974 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
12975 +        * reference and incremented our key2 reference count.
12976          */
12977 +       hb2 = hash_futex(&key2);
12978  
12979         /* Check if the requeue code acquired the second futex for us. */
12980         if (!q.rt_waiter) {
12981 @@ -2894,16 +3122,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12982                  * did a lock-steal - fix up the PI-state in that case.
12983                  */
12984                 if (q.pi_state && (q.pi_state->owner != current)) {
12985 -                       spin_lock(q.lock_ptr);
12986 +                       spin_lock(&hb2->lock);
12987 +                       BUG_ON(&hb2->lock != q.lock_ptr);
12988                         ret = fixup_pi_state_owner(uaddr2, &q, current);
12989 -                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
12990 -                               rt_mutex_unlock(&q.pi_state->pi_mutex);
12991 +                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
12992 +                               pi_state = q.pi_state;
12993 +                               get_pi_state(pi_state);
12994 +                       }
12995                         /*
12996                          * Drop the reference to the pi state which
12997                          * the requeue_pi() code acquired for us.
12998                          */
12999                         put_pi_state(q.pi_state);
13000 -                       spin_unlock(q.lock_ptr);
13001 +                       spin_unlock(&hb2->lock);
13002                 }
13003         } else {
13004                 struct rt_mutex *pi_mutex;
13005 @@ -2915,10 +3146,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13006                  */
13007                 WARN_ON(!q.pi_state);
13008                 pi_mutex = &q.pi_state->pi_mutex;
13009 -               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
13010 -               debug_rt_mutex_free_waiter(&rt_waiter);
13011 +               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
13012  
13013 -               spin_lock(q.lock_ptr);
13014 +               spin_lock(&hb2->lock);
13015 +               BUG_ON(&hb2->lock != q.lock_ptr);
13016 +               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
13017 +                       ret = 0;
13018 +
13019 +               debug_rt_mutex_free_waiter(&rt_waiter);
13020                 /*
13021                  * Fixup the pi_state owner and possibly acquire the lock if we
13022                  * haven't already.
13023 @@ -2936,13 +3171,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
13024                  * the fault, unlock the rt_mutex and return the fault to
13025                  * userspace.
13026                  */
13027 -               if (ret && rt_mutex_owner(pi_mutex) == current)
13028 -                       rt_mutex_unlock(pi_mutex);
13029 +               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
13030 +                       pi_state = q.pi_state;
13031 +                       get_pi_state(pi_state);
13032 +               }
13033  
13034                 /* Unqueue and drop the lock. */
13035                 unqueue_me_pi(&q);
13036         }
13037  
13038 +       if (pi_state) {
13039 +               rt_mutex_futex_unlock(&pi_state->pi_mutex);
13040 +               put_pi_state(pi_state);
13041 +       }
13042 +
13043         if (ret == -EINTR) {
13044                 /*
13045                  * We've already been requeued, but cannot restart by calling
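
For reference, the user-space half of the PI-futex protocol that the futex_lock_pi()/
futex_unlock_pi() paths above serve looks roughly like the sketch below. Helper names
are illustrative; error handling, timeouts and owner-died handling are omitted. The
futex word holds 0 when free and the owner TID when held; only contention enters the
kernel.

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
                      const struct timespec *timeout)
{
        return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

static void pi_lock(uint32_t *uaddr)
{
        uint32_t expected = 0;
        uint32_t tid = syscall(SYS_gettid);

        /* Uncontended fast path: 0 -> TID, entirely in user space. */
        if (__atomic_compare_exchange_n(uaddr, &expected, tid, 0,
                                        __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                return;
        /* Contended: the kernel sets FUTEX_WAITERS and blocks us on the
         * pi_state rt_mutex in futex_lock_pi(). */
        sys_futex(uaddr, FUTEX_LOCK_PI, 0, NULL);
}

static void pi_unlock(uint32_t *uaddr)
{
        uint32_t expected = syscall(SYS_gettid);

        /* Uncontended fast path: TID -> 0 only succeeds while no waiter
         * has set FUTEX_WAITERS. */
        if (__atomic_compare_exchange_n(uaddr, &expected, 0, 0,
                                        __ATOMIC_RELEASE, __ATOMIC_RELAXED))
                return;
        /* Contended: futex_unlock_pi() hands the rt_mutex and the futex
         * word over to the top waiter. */
        sys_futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL);
}
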
13046 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
13047 index d3f24905852c..f87aa8fdcc51 100644
13048 --- a/kernel/irq/handle.c
13049 +++ b/kernel/irq/handle.c
13050 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
13051  {
13052         irqreturn_t retval;
13053         unsigned int flags = 0;
13054 +       struct pt_regs *regs = get_irq_regs();
13055 +       u64 ip = regs ? instruction_pointer(regs) : 0;
13056  
13057         retval = __handle_irq_event_percpu(desc, &flags);
13058  
13059 -       add_interrupt_randomness(desc->irq_data.irq, flags);
13060 +#ifdef CONFIG_PREEMPT_RT_FULL
13061 +       desc->random_ip = ip;
13062 +#else
13063 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
13064 +#endif
13065  
13066         if (!noirqdebug)
13067                 note_interrupt(desc, retval);
13068 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
13069 index 6b669593e7eb..e357bf6c59d5 100644
13070 --- a/kernel/irq/manage.c
13071 +++ b/kernel/irq/manage.c
13072 @@ -22,6 +22,7 @@
13073  #include "internals.h"
13074  
13075  #ifdef CONFIG_IRQ_FORCED_THREADING
13076 +# ifndef CONFIG_PREEMPT_RT_BASE
13077  __read_mostly bool force_irqthreads;
13078  
13079  static int __init setup_forced_irqthreads(char *arg)
13080 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
13081         return 0;
13082  }
13083  early_param("threadirqs", setup_forced_irqthreads);
13084 +# endif
13085  #endif
13086  
13087  static void __synchronize_hardirq(struct irq_desc *desc)
13088 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
13089  
13090         if (desc->affinity_notify) {
13091                 kref_get(&desc->affinity_notify->kref);
13092 +
13093 +#ifdef CONFIG_PREEMPT_RT_BASE
13094 +               swork_queue(&desc->affinity_notify->swork);
13095 +#else
13096                 schedule_work(&desc->affinity_notify->work);
13097 +#endif
13098         }
13099         irqd_set(data, IRQD_AFFINITY_SET);
13100  
13101 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
13102  }
13103  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
13104  
13105 -static void irq_affinity_notify(struct work_struct *work)
13106 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
13107  {
13108 -       struct irq_affinity_notify *notify =
13109 -               container_of(work, struct irq_affinity_notify, work);
13110         struct irq_desc *desc = irq_to_desc(notify->irq);
13111         cpumask_var_t cpumask;
13112         unsigned long flags;
13113 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
13114         kref_put(&notify->kref, notify->release);
13115  }
13116  
13117 +#ifdef CONFIG_PREEMPT_RT_BASE
13118 +static void init_helper_thread(void)
13119 +{
13120 +       static int init_sworker_once;
13121 +
13122 +       if (init_sworker_once)
13123 +               return;
13124 +       if (WARN_ON(swork_get()))
13125 +               return;
13126 +       init_sworker_once = 1;
13127 +}
13128 +
13129 +static void irq_affinity_notify(struct swork_event *swork)
13130 +{
13131 +       struct irq_affinity_notify *notify =
13132 +               container_of(swork, struct irq_affinity_notify, swork);
13133 +       _irq_affinity_notify(notify);
13134 +}
13135 +
13136 +#else
13137 +
13138 +static void irq_affinity_notify(struct work_struct *work)
13139 +{
13140 +       struct irq_affinity_notify *notify =
13141 +               container_of(work, struct irq_affinity_notify, work);
13142 +       _irq_affinity_notify(notify);
13143 +}
13144 +#endif
13145 +
13146  /**
13147   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
13148   *     @irq:           Interrupt for which to enable/disable notification
13149 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
13150         if (notify) {
13151                 notify->irq = irq;
13152                 kref_init(&notify->kref);
13153 +#ifdef CONFIG_PREEMPT_RT_BASE
13154 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
13155 +               init_helper_thread();
13156 +#else
13157                 INIT_WORK(&notify->work, irq_affinity_notify);
13158 +#endif
13159         }
13160  
13161         raw_spin_lock_irqsave(&desc->lock, flags);
13162 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
13163         local_bh_disable();
13164         ret = action->thread_fn(action->irq, action->dev_id);
13165         irq_finalize_oneshot(desc, action);
13166 -       local_bh_enable();
13167 +       /*
13168 +        * Interrupts which have real time requirements can be set up
13169 +        * to avoid softirq processing in the thread handler. This is
13170 +        * safe as these interrupts do not raise soft interrupts.
13171 +        */
13172 +       if (irq_settings_no_softirq_call(desc))
13173 +               _local_bh_enable();
13174 +       else
13175 +               local_bh_enable();
13176         return ret;
13177  }
13178  
13179 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
13180                 if (action_ret == IRQ_WAKE_THREAD)
13181                         irq_wake_secondary(desc, action);
13182  
13183 +#ifdef CONFIG_PREEMPT_RT_FULL
13184 +               migrate_disable();
13185 +               add_interrupt_randomness(action->irq, 0,
13186 +                                desc->random_ip ^ (unsigned long) action);
13187 +               migrate_enable();
13188 +#endif
13189                 wake_threads_waitq(desc);
13190         }
13191  
13192 @@ -1336,6 +1389,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
13193                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
13194                 }
13195  
13196 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
13197 +                       irq_settings_set_no_softirq_call(desc);
13198 +
13199                 /* Set default affinity mask once everything is setup */
13200                 setup_affinity(desc, mask);
13201  
13202 @@ -2061,7 +2117,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
13203   *     This call sets the internal irqchip state of an interrupt,
13204   *     depending on the value of @which.
13205   *
13206 - *     This function should be called with preemption disabled if the
13207 + *     This function should be called with migration disabled if the
13208   *     interrupt controller has per-cpu registers.
13209   */
13210  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
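
A driver with hard real-time requirements opts into the behaviour above by passing
IRQF_NO_SOFTIRQ_CALL (the flag is introduced elsewhere in this patch) when requesting
its threaded handler, so irq_forced_thread_fn() finishes with _local_bh_enable()
instead of running softirqs. A minimal sketch with hypothetical device names:

#include <linux/interrupt.h>

static irqreturn_t mydev_irq_thread(int irq, void *dev_id)
{
        /* Handler must not raise softirqs itself when using this flag. */
        return IRQ_HANDLED;
}

static int mydev_setup_irq(unsigned int irq, void *dev)
{
        return request_threaded_irq(irq, NULL, mydev_irq_thread,
                                    IRQF_ONESHOT | IRQF_NO_SOFTIRQ_CALL,
                                    "mydev", dev);
}
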
13211 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
13212 index 320579d89091..2df2d4445b1e 100644
13213 --- a/kernel/irq/settings.h
13214 +++ b/kernel/irq/settings.h
13215 @@ -16,6 +16,7 @@ enum {
13216         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
13217         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
13218         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
13219 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
13220         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
13221  };
13222  
13223 @@ -30,6 +31,7 @@ enum {
13224  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
13225  #define IRQ_IS_POLLED          GOT_YOU_MORON
13226  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
13227 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
13228  #undef IRQF_MODIFY_MASK
13229  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
13230  
13231 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
13232         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
13233  }
13234  
13235 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
13236 +{
13237 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
13238 +}
13239 +
13240 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
13241 +{
13242 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
13243 +}
13244 +
13245  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
13246  {
13247         return desc->status_use_accessors & _IRQ_PER_CPU;
13248 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
13249 index 5707f97a3e6a..73f38dc7a7fb 100644
13250 --- a/kernel/irq/spurious.c
13251 +++ b/kernel/irq/spurious.c
13252 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
13253  
13254  static int __init irqfixup_setup(char *str)
13255  {
13256 +#ifdef CONFIG_PREEMPT_RT_BASE
13257 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13258 +       return 1;
13259 +#endif
13260         irqfixup = 1;
13261         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
13262         printk(KERN_WARNING "This may impact system performance.\n");
13263 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
13264  
13265  static int __init irqpoll_setup(char *str)
13266  {
13267 +#ifdef CONFIG_PREEMPT_RT_BASE
13268 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13269 +       return 1;
13270 +#endif
13271         irqfixup = 2;
13272         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
13273                                 "enabled\n");
13274 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
13275 index bcf107ce0854..2899ba0d23d1 100644
13276 --- a/kernel/irq_work.c
13277 +++ b/kernel/irq_work.c
13278 @@ -17,6 +17,7 @@
13279  #include <linux/cpu.h>
13280  #include <linux/notifier.h>
13281  #include <linux/smp.h>
13282 +#include <linux/interrupt.h>
13283  #include <asm/processor.h>
13284  
13285  
13286 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
13287   */
13288  bool irq_work_queue_on(struct irq_work *work, int cpu)
13289  {
13290 +       struct llist_head *list;
13291 +
13292         /* All work should have been flushed before going offline */
13293         WARN_ON_ONCE(cpu_is_offline(cpu));
13294  
13295 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
13296         if (!irq_work_claim(work))
13297                 return false;
13298  
13299 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
13300 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
13301 +               list = &per_cpu(lazy_list, cpu);
13302 +       else
13303 +               list = &per_cpu(raised_list, cpu);
13304 +
13305 +       if (llist_add(&work->llnode, list))
13306                 arch_send_call_function_single_ipi(cpu);
13307  
13308         return true;
13309 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
13310  /* Enqueue the irq work @work on the current CPU */
13311  bool irq_work_queue(struct irq_work *work)
13312  {
13313 +       struct llist_head *list;
13314 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
13315 +
13316         /* Only queue if not already pending */
13317         if (!irq_work_claim(work))
13318                 return false;
13319 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
13320         /* Queue the entry and raise the IPI if needed. */
13321         preempt_disable();
13322  
13323 -       /* If the work is "lazy", handle it from next tick if any */
13324 -       if (work->flags & IRQ_WORK_LAZY) {
13325 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
13326 -                   tick_nohz_tick_stopped())
13327 -                       arch_irq_work_raise();
13328 -       } else {
13329 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
13330 +       lazy_work = work->flags & IRQ_WORK_LAZY;
13331 +
13332 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
13333 +               list = this_cpu_ptr(&lazy_list);
13334 +       else
13335 +               list = this_cpu_ptr(&raised_list);
13336 +
13337 +       if (llist_add(&work->llnode, list)) {
13338 +               if (!lazy_work || tick_nohz_tick_stopped())
13339                         arch_irq_work_raise();
13340         }
13341  
13342 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
13343         raised = this_cpu_ptr(&raised_list);
13344         lazy = this_cpu_ptr(&lazy_list);
13345  
13346 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
13347 -               if (llist_empty(lazy))
13348 -                       return false;
13349 +       if (llist_empty(raised) && llist_empty(lazy))
13350 +               return false;
13351  
13352         /* All work should have been flushed before going offline */
13353         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
13354 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
13355         struct irq_work *work;
13356         struct llist_node *llnode;
13357  
13358 -       BUG_ON(!irqs_disabled());
13359 +       BUG_ON_NONRT(!irqs_disabled());
13360  
13361         if (llist_empty(list))
13362                 return;
13363 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
13364  void irq_work_run(void)
13365  {
13366         irq_work_run_list(this_cpu_ptr(&raised_list));
13367 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
13368 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
13369 +               /*
13370 +                * NOTE: we raise softirq via IPI for safety,
13371 +                * and execute in irq_work_tick() to move the
13372 +                * overhead from hard to soft irq context.
13373 +                */
13374 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
13375 +                       raise_softirq(TIMER_SOFTIRQ);
13376 +       } else
13377 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13378  }
13379  EXPORT_SYMBOL_GPL(irq_work_run);
13380  
13381 @@ -179,8 +200,17 @@ void irq_work_tick(void)
13382  
13383         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
13384                 irq_work_run_list(raised);
13385 +
13386 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
13387 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13388 +}
13389 +
13390 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
13391 +void irq_work_tick_soft(void)
13392 +{
13393         irq_work_run_list(this_cpu_ptr(&lazy_list));
13394  }
13395 +#endif
13396  
13397  /*
13398   * Synchronize against the irq_work @entry, ensures the entry is not
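
Work items that must stay on raised_list and run from the IPI path even on
PREEMPT_RT_FULL mark themselves with IRQ_WORK_HARD_IRQ, which the queueing code
above checks. A minimal sketch, assuming the flag as defined by this patch series
and illustrative names:

#include <linux/irq_work.h>

static void my_hard_irq_work(struct irq_work *work)
{
        /* Runs from the raised_list/IPI path, never deferred to the
         * lazy (TIMER_SOFTIRQ) list, even on PREEMPT_RT_FULL. */
}

static struct irq_work my_work;

static void my_work_init(void)
{
        init_irq_work(&my_work, my_hard_irq_work);
        /* Keep this item on raised_list on RT kernels. */
        my_work.flags = IRQ_WORK_HARD_IRQ;
}

static void my_work_kick(void)
{
        irq_work_queue(&my_work);
}
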
13399 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
13400 index ee1bc1bb8feb..ddef07958840 100644
13401 --- a/kernel/ksysfs.c
13402 +++ b/kernel/ksysfs.c
13403 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
13404  
13405  #endif /* CONFIG_KEXEC_CORE */
13406  
13407 +#if defined(CONFIG_PREEMPT_RT_FULL)
13408 +static ssize_t  realtime_show(struct kobject *kobj,
13409 +                             struct kobj_attribute *attr, char *buf)
13410 +{
13411 +       return sprintf(buf, "%d\n", 1);
13412 +}
13413 +KERNEL_ATTR_RO(realtime);
13414 +#endif
13415 +
13416  /* whether file capabilities are enabled */
13417  static ssize_t fscaps_show(struct kobject *kobj,
13418                                   struct kobj_attribute *attr, char *buf)
13419 @@ -225,6 +234,9 @@ static struct attribute * kernel_attrs[] = {
13420         &rcu_expedited_attr.attr,
13421         &rcu_normal_attr.attr,
13422  #endif
13423 +#ifdef CONFIG_PREEMPT_RT_FULL
13424 +       &realtime_attr.attr,
13425 +#endif
13426         NULL
13427  };
13428  
13429 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
13430 index 6f88e352cd4f..6ff9e8011dd0 100644
13431 --- a/kernel/locking/Makefile
13432 +++ b/kernel/locking/Makefile
13433 @@ -2,7 +2,7 @@
13434  # and is generally not a function of system call inputs.
13435  KCOV_INSTRUMENT                := n
13436  
13437 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
13438 +obj-y += semaphore.o percpu-rwsem.o
13439  
13440  ifdef CONFIG_FUNCTION_TRACER
13441  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
13442 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
13443  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
13444  endif
13445  
13446 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13447 +obj-y += mutex.o
13448  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
13449 +endif
13450 +obj-y += rwsem.o
13451  obj-$(CONFIG_LOCKDEP) += lockdep.o
13452  ifeq ($(CONFIG_PROC_FS),y)
13453  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
13454 @@ -24,7 +28,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
13455  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
13456  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
13457  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
13458 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13459  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
13460  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
13461 +endif
13462 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o rwsem-rt.o
13463  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
13464  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
13465 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
13466 index 4d7ffc0a0d00..3d157b3128eb 100644
13467 --- a/kernel/locking/lockdep.c
13468 +++ b/kernel/locking/lockdep.c
13469 @@ -658,6 +658,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13470         struct lockdep_subclass_key *key;
13471         struct hlist_head *hash_head;
13472         struct lock_class *class;
13473 +       bool is_static = false;
13474  
13475         if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
13476                 debug_locks_off();
13477 @@ -671,10 +672,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13478  
13479         /*
13480          * Static locks do not have their class-keys yet - for them the key
13481 -        * is the lock object itself:
13482 +        * is the lock object itself. If the lock is in the per cpu area,
13483 +        * the canonical address of the lock (per cpu offset removed) is
13484 +        * used.
13485          */
13486 -       if (unlikely(!lock->key))
13487 -               lock->key = (void *)lock;
13488 +       if (unlikely(!lock->key)) {
13489 +               unsigned long can_addr, addr = (unsigned long)lock;
13490 +
13491 +               if (__is_kernel_percpu_address(addr, &can_addr))
13492 +                       lock->key = (void *)can_addr;
13493 +               else if (__is_module_percpu_address(addr, &can_addr))
13494 +                       lock->key = (void *)can_addr;
13495 +               else if (static_obj(lock))
13496 +                       lock->key = (void *)lock;
13497 +               else
13498 +                       return ERR_PTR(-EINVAL);
13499 +               is_static = true;
13500 +       }
13501  
13502         /*
13503          * NOTE: the class-key must be unique. For dynamic locks, a static
13504 @@ -706,7 +720,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
13505                 }
13506         }
13507  
13508 -       return NULL;
13509 +       return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
13510  }
13511  
13512  /*
13513 @@ -724,19 +738,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
13514         DEBUG_LOCKS_WARN_ON(!irqs_disabled());
13515  
13516         class = look_up_lock_class(lock, subclass);
13517 -       if (likely(class))
13518 +       if (likely(!IS_ERR_OR_NULL(class)))
13519                 goto out_set_class_cache;
13520  
13521         /*
13522          * Debug-check: all keys must be persistent!
13523 -        */
13524 -       if (!static_obj(lock->key)) {
13525 +        */
13526 +       if (IS_ERR(class)) {
13527                 debug_locks_off();
13528                 printk("INFO: trying to register non-static key.\n");
13529                 printk("the code is fine but needs lockdep annotation.\n");
13530                 printk("turning off the locking correctness validator.\n");
13531                 dump_stack();
13532 -
13533                 return NULL;
13534         }
13535  
13536 @@ -3410,7 +3423,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
13537                  * Clearly if the lock hasn't been acquired _ever_, we're not
13538                  * holding it either, so report failure.
13539                  */
13540 -               if (!class)
13541 +               if (IS_ERR_OR_NULL(class))
13542                         return 0;
13543  
13544                 /*
13545 @@ -3689,6 +3702,7 @@ static void check_flags(unsigned long flags)
13546                 }
13547         }
13548  
13549 +#ifndef CONFIG_PREEMPT_RT_FULL
13550         /*
13551          * We dont accurately track softirq state in e.g.
13552          * hardirq contexts (such as on 4KSTACKS), so only
13553 @@ -3703,6 +3717,7 @@ static void check_flags(unsigned long flags)
13554                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
13555                 }
13556         }
13557 +#endif
13558  
13559         if (!debug_locks)
13560                 print_irqtrace_events(current);
13561 @@ -4159,7 +4174,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
13562                  * If the class exists we look it up and zap it:
13563                  */
13564                 class = look_up_lock_class(lock, j);
13565 -               if (class)
13566 +               if (!IS_ERR_OR_NULL(class))
13567                         zap_class(class);
13568         }
13569         /*
13570 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
13571 index f8c5af52a131..788068773e61 100644
13572 --- a/kernel/locking/locktorture.c
13573 +++ b/kernel/locking/locktorture.c
13574 @@ -26,7 +26,6 @@
13575  #include <linux/kthread.h>
13576  #include <linux/sched/rt.h>
13577  #include <linux/spinlock.h>
13578 -#include <linux/rwlock.h>
13579  #include <linux/mutex.h>
13580  #include <linux/rwsem.h>
13581  #include <linux/smp.h>
13582 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
13583 index ce182599cf2e..2ad3a1e8344c 100644
13584 --- a/kernel/locking/percpu-rwsem.c
13585 +++ b/kernel/locking/percpu-rwsem.c
13586 @@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
13587         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
13588         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
13589         __init_rwsem(&sem->rw_sem, name, rwsem_key);
13590 -       init_waitqueue_head(&sem->writer);
13591 +       init_swait_queue_head(&sem->writer);
13592         sem->readers_block = 0;
13593         return 0;
13594  }
13595 @@ -103,7 +103,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
13596         __this_cpu_dec(*sem->read_count);
13597  
13598         /* Prod writer to recheck readers_active */
13599 -       wake_up(&sem->writer);
13600 +       swake_up(&sem->writer);
13601  }
13602  EXPORT_SYMBOL_GPL(__percpu_up_read);
13603  
13604 @@ -160,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
13605          */
13606  
13607         /* Wait for all now active readers to complete. */
13608 -       wait_event(sem->writer, readers_active_check(sem));
13609 +       swait_event(sem->writer, readers_active_check(sem));
13610  }
13611  EXPORT_SYMBOL_GPL(percpu_down_write);
13612  
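
The caller-visible percpu_rw_semaphore API is unchanged by the swait conversion; only
the writer-side wait and the reader-side wakeup move to simple wait queues. A minimal
usage sketch with an illustrative lock name:

#include <linux/percpu-rwsem.h>

DEFINE_STATIC_PERCPU_RWSEM(my_state_sem);

static void my_reader(void)
{
        percpu_down_read(&my_state_sem);  /* per-cpu fast path, no global lock */
        /* ... read-side section ... */
        percpu_up_read(&my_state_sem);    /* slow path swake_up()s a pending writer */
}

static void my_writer(void)
{
        percpu_down_write(&my_state_sem); /* swait_event()s until readers drain */
        /* ... write-side section ... */
        percpu_up_write(&my_state_sem);
}
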
13613 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
13614 new file mode 100644
13615 index 000000000000..6284e3b15091
13616 --- /dev/null
13617 +++ b/kernel/locking/rt.c
13618 @@ -0,0 +1,331 @@
13619 +/*
13620 + * kernel/rt.c
13621 + *
13622 + * Real-Time Preemption Support
13623 + *
13624 + * started by Ingo Molnar:
13625 + *
13626 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
13627 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13628 + *
13629 + * historic credit for proving that Linux spinlocks can be implemented via
13630 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
13631 + * and others) who prototyped it on 2.4 and did lots of comparative
13632 + * research and analysis; TimeSys, for proving that you can implement a
13633 + * fully preemptible kernel via the use of IRQ threading and mutexes;
13634 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
13635 + * right one; and to MontaVista, who ported pmutexes to 2.6.
13636 + *
13637 + * This code is a from-scratch implementation and is not based on pmutexes,
13638 + * but the idea of converting spinlocks to mutexes is used here too.
13639 + *
13640 + * lock debugging, locking tree, deadlock detection:
13641 + *
13642 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
13643 + *  Released under the General Public License (GPL).
13644 + *
13645 + * Includes portions of the generic R/W semaphore implementation from:
13646 + *
13647 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
13648 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
13649 + *  - Derived also from comments by Linus
13650 + *
13651 + * Pending ownership of locks and ownership stealing:
13652 + *
13653 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
13654 + *
13655 + *   (also by Steven Rostedt)
13656 + *    - Converted single pi_lock to individual task locks.
13657 + *
13658 + * By Esben Nielsen:
13659 + *    Doing priority inheritance with help of the scheduler.
13660 + *
13661 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13662 + *  - major rework based on Esben Nielsens initial patch
13663 + *  - replaced thread_info references by task_struct refs
13664 + *  - removed task->pending_owner dependency
13665 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
13666 + *    in the scheduler return path as discussed with Steven Rostedt
13667 + *
13668 + *  Copyright (C) 2006, Kihon Technologies Inc.
13669 + *    Steven Rostedt <rostedt@goodmis.org>
13670 + *  - debugged and patched Thomas Gleixner's rework.
13671 + *  - added back the cmpxchg to the rework.
13672 + *  - turned atomic require back on for SMP.
13673 + */
13674 +
13675 +#include <linux/spinlock.h>
13676 +#include <linux/rtmutex.h>
13677 +#include <linux/sched.h>
13678 +#include <linux/delay.h>
13679 +#include <linux/module.h>
13680 +#include <linux/kallsyms.h>
13681 +#include <linux/syscalls.h>
13682 +#include <linux/interrupt.h>
13683 +#include <linux/plist.h>
13684 +#include <linux/fs.h>
13685 +#include <linux/futex.h>
13686 +#include <linux/hrtimer.h>
13687 +
13688 +#include "rtmutex_common.h"
13689 +
13690 +/*
13691 + * struct mutex functions
13692 + */
13693 +void __mutex_do_init(struct mutex *mutex, const char *name,
13694 +                    struct lock_class_key *key)
13695 +{
13696 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13697 +       /*
13698 +        * Make sure we are not reinitializing a held lock:
13699 +        */
13700 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
13701 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
13702 +#endif
13703 +       mutex->lock.save_state = 0;
13704 +}
13705 +EXPORT_SYMBOL(__mutex_do_init);
13706 +
13707 +void __lockfunc _mutex_lock(struct mutex *lock)
13708 +{
13709 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13710 +       rt_mutex_lock(&lock->lock);
13711 +}
13712 +EXPORT_SYMBOL(_mutex_lock);
13713 +
13714 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
13715 +{
13716 +       int ret;
13717 +
13718 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13719 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13720 +       if (ret)
13721 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13722 +       return ret;
13723 +}
13724 +EXPORT_SYMBOL(_mutex_lock_interruptible);
13725 +
13726 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
13727 +{
13728 +       int ret;
13729 +
13730 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13731 +       ret = rt_mutex_lock_killable(&lock->lock);
13732 +       if (ret)
13733 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13734 +       return ret;
13735 +}
13736 +EXPORT_SYMBOL(_mutex_lock_killable);
13737 +
13738 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13739 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
13740 +{
13741 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13742 +       rt_mutex_lock(&lock->lock);
13743 +}
13744 +EXPORT_SYMBOL(_mutex_lock_nested);
13745 +
13746 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
13747 +{
13748 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
13749 +       rt_mutex_lock(&lock->lock);
13750 +}
13751 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
13752 +
13753 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
13754 +{
13755 +       int ret;
13756 +
13757 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13758 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13759 +       if (ret)
13760 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13761 +       return ret;
13762 +}
13763 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
13764 +
13765 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
13766 +{
13767 +       int ret;
13768 +
13769 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
13770 +       ret = rt_mutex_lock_killable(&lock->lock);
13771 +       if (ret)
13772 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13773 +       return ret;
13774 +}
13775 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
13776 +#endif
13777 +
13778 +int __lockfunc _mutex_trylock(struct mutex *lock)
13779 +{
13780 +       int ret = rt_mutex_trylock(&lock->lock);
13781 +
13782 +       if (ret)
13783 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13784 +
13785 +       return ret;
13786 +}
13787 +EXPORT_SYMBOL(_mutex_trylock);
13788 +
13789 +void __lockfunc _mutex_unlock(struct mutex *lock)
13790 +{
13791 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
13792 +       rt_mutex_unlock(&lock->lock);
13793 +}
13794 +EXPORT_SYMBOL(_mutex_unlock);
13795 +
13796 +/*
13797 + * rwlock_t functions
13798 + */
13799 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
13800 +{
13801 +       int ret;
13802 +
13803 +       migrate_disable();
13804 +       ret = rt_mutex_trylock(&rwlock->lock);
13805 +       if (ret)
13806 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13807 +       else
13808 +               migrate_enable();
13809 +
13810 +       return ret;
13811 +}
13812 +EXPORT_SYMBOL(rt_write_trylock);
13813 +
13814 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
13815 +{
13816 +       int ret;
13817 +
13818 +       *flags = 0;
13819 +       ret = rt_write_trylock(rwlock);
13820 +       return ret;
13821 +}
13822 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
13823 +
13824 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
13825 +{
13826 +       struct rt_mutex *lock = &rwlock->lock;
13827 +       int ret = 1;
13828 +
13829 +       /*
13830 +        * recursive read locks succeed when current owns the lock,
13831 +        * but not when read_depth == 0 which means that the lock is
13832 +        * write locked.
13833 +        */
13834 +       if (rt_mutex_owner(lock) != current) {
13835 +               migrate_disable();
13836 +               ret = rt_mutex_trylock(lock);
13837 +               if (ret)
13838 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13839 +               else
13840 +                       migrate_enable();
13841 +
13842 +       } else if (!rwlock->read_depth) {
13843 +               ret = 0;
13844 +       }
13845 +
13846 +       if (ret)
13847 +               rwlock->read_depth++;
13848 +
13849 +       return ret;
13850 +}
13851 +EXPORT_SYMBOL(rt_read_trylock);
13852 +
13853 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
13854 +{
13855 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13856 +       __rt_spin_lock(&rwlock->lock);
13857 +}
13858 +EXPORT_SYMBOL(rt_write_lock);
13859 +
13860 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
13861 +{
13862 +       struct rt_mutex *lock = &rwlock->lock;
13863 +
13864 +
13865 +       /*
13866 +        * recursive read locks succeed when current owns the lock
13867 +        */
13868 +       if (rt_mutex_owner(lock) != current) {
13869 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13870 +               __rt_spin_lock(lock);
13871 +       }
13872 +       rwlock->read_depth++;
13873 +}
13874 +
13875 +EXPORT_SYMBOL(rt_read_lock);
13876 +
13877 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
13878 +{
13879 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13880 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13881 +       __rt_spin_unlock(&rwlock->lock);
13882 +       migrate_enable();
13883 +}
13884 +EXPORT_SYMBOL(rt_write_unlock);
13885 +
13886 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
13887 +{
13888 +       /* Release the lock only when read_depth is down to 0 */
13889 +       if (--rwlock->read_depth == 0) {
13890 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13891 +               __rt_spin_unlock(&rwlock->lock);
13892 +               migrate_enable();
13893 +       }
13894 +}
13895 +EXPORT_SYMBOL(rt_read_unlock);
13896 +
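As a sanity check of the read_depth bookkeeping above, here is a minimal, purely illustrative sketch of recursive reader use on an RT rwlock (foo_lock and foo_dump() are hypothetical; read_lock()/read_unlock() map onto rt_read_lock()/rt_read_unlock() on PREEMPT_RT_FULL):

static DEFINE_RWLOCK(foo_lock);

static void foo_dump(void)
{
        read_lock(&foo_lock);   /* takes the underlying rt_mutex, read_depth = 1 */
        read_lock(&foo_lock);   /* same owner: only read_depth++, no blocking    */
        /* ... walk read-mostly data ... */
        read_unlock(&foo_lock); /* read_depth back to 1, lock still held         */
        read_unlock(&foo_lock); /* read_depth reaches 0, the rt_mutex is released */
}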
13897 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
13898 +{
13899 +       rt_write_lock(rwlock);
13900 +
13901 +       return 0;
13902 +}
13903 +EXPORT_SYMBOL(rt_write_lock_irqsave);
13904 +
13905 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
13906 +{
13907 +       rt_read_lock(rwlock);
13908 +
13909 +       return 0;
13910 +}
13911 +EXPORT_SYMBOL(rt_read_lock_irqsave);
13912 +
13913 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
13914 +{
13915 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13916 +       /*
13917 +        * Make sure we are not reinitializing a held lock:
13918 +        */
13919 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
13920 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
13921 +#endif
13922 +       rwlock->lock.save_state = 1;
13923 +       rwlock->read_depth = 0;
13924 +}
13925 +EXPORT_SYMBOL(__rt_rwlock_init);
13926 +
13927 +/**
13928 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
13929 + * @cnt: the atomic which we are to dec
13930 + * @lock: the mutex to return holding if we dec to 0
13931 + *
13932 + * return true and hold lock if we dec to 0, return false otherwise
13933 + */
13934 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
13935 +{
13936 +       /* dec if we can't possibly hit 0 */
13937 +       if (atomic_add_unless(cnt, -1, 1))
13938 +               return 0;
13939 +       /* we might hit 0, so take the lock */
13940 +       mutex_lock(lock);
13941 +       if (!atomic_dec_and_test(cnt)) {
13942 +               /* when we actually did the dec, we didn't hit 0 */
13943 +               mutex_unlock(lock);
13944 +               return 0;
13945 +       }
13946 +       /* we hit 0, and we hold the lock */
13947 +       return 1;
13948 +}
13949 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
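A typical caller pattern for atomic_dec_and_mutex_lock() is dropping the last reference while holding the lock that protects the object's list membership; a minimal sketch (struct foo, foo_list_lock and foo_put() are hypothetical names):

static void foo_put(struct foo *f)
{
        /* Fast path: the count stays above zero, the mutex is never taken. */
        if (!atomic_dec_and_mutex_lock(&f->refcnt, &foo_list_lock))
                return;

        /* We dropped the final reference and now hold foo_list_lock. */
        list_del(&f->node);
        mutex_unlock(&foo_list_lock);
        kfree(f);
}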
13950 diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
13951 index 62b6cee8ea7f..0613c4b1d059 100644
13952 --- a/kernel/locking/rtmutex-debug.c
13953 +++ b/kernel/locking/rtmutex-debug.c
13954 @@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
13955         lock->name = name;
13956  }
13957  
13958 -void
13959 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
13960 -{
13961 -}
13962 -
13963 -void rt_mutex_deadlock_account_unlock(struct task_struct *task)
13964 -{
13965 -}
13966 -
13967 diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
13968 index d0519c3432b6..b585af9a1b50 100644
13969 --- a/kernel/locking/rtmutex-debug.h
13970 +++ b/kernel/locking/rtmutex-debug.h
13971 @@ -9,9 +9,6 @@
13972   * This file contains macros used solely by rtmutex.c. Debug version.
13973   */
13974  
13975 -extern void
13976 -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
13977 -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
13978  extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
13979  extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
13980  extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
13981 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
13982 index 2c49d76f96c3..eec63f064b3f 100644
13983 --- a/kernel/locking/rtmutex.c
13984 +++ b/kernel/locking/rtmutex.c
13985 @@ -7,6 +7,11 @@
13986   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13987   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
13988   *  Copyright (C) 2006 Esben Nielsen
13989 + *  Adaptive Spinlocks:
13990 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
13991 + *                                  and Peter Morreale,
13992 + * Adaptive Spinlocks simplification:
13993 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
13994   *
13995   *  See Documentation/locking/rt-mutex-design.txt for details.
13996   */
13997 @@ -16,6 +21,7 @@
13998  #include <linux/sched/rt.h>
13999  #include <linux/sched/deadline.h>
14000  #include <linux/timer.h>
14001 +#include <linux/ww_mutex.h>
14002  
14003  #include "rtmutex_common.h"
14004  
14005 @@ -133,6 +139,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
14006                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
14007  }
14008  
14009 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
14010 +{
14011 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
14012 +               waiter != PI_REQUEUE_INPROGRESS;
14013 +}
14014 +
14015  /*
14016   * We can speed up the acquire/release, if there's no debugging state to be
14017   * set up.
14018 @@ -222,12 +234,25 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
14019  }
14020  #endif
14021  
14022 +#define STEAL_NORMAL  0
14023 +#define STEAL_LATERAL 1
14024 +/*
14025 + * Only use with rt_mutex_waiter_{less,equal}()
14026 + */
14027 +#define task_to_waiter(p)      \
14028 +       &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
14029 +
14030  static inline int
14031  rt_mutex_waiter_less(struct rt_mutex_waiter *left,
14032 -                    struct rt_mutex_waiter *right)
14033 +                    struct rt_mutex_waiter *right, int mode)
14034  {
14035 -       if (left->prio < right->prio)
14036 -               return 1;
14037 +       if (mode == STEAL_NORMAL) {
14038 +               if (left->prio < right->prio)
14039 +                       return 1;
14040 +       } else {
14041 +               if (left->prio <= right->prio)
14042 +                       return 1;
14043 +       }
14044  
14045         /*
14046          * If both waiters have dl_prio(), we check the deadlines of the
14047 @@ -236,12 +261,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
14048          * then right waiter has a dl_prio() too.
14049          */
14050         if (dl_prio(left->prio))
14051 -               return dl_time_before(left->task->dl.deadline,
14052 -                                     right->task->dl.deadline);
14053 +               return dl_time_before(left->deadline, right->deadline);
14054  
14055         return 0;
14056  }
14057  
14058 +static inline int
14059 +rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
14060 +                     struct rt_mutex_waiter *right)
14061 +{
14062 +       if (left->prio != right->prio)
14063 +               return 0;
14064 +
14065 +       /*
14066 +        * If both waiters have dl_prio(), we check the deadlines of the
14067 +        * associated tasks.
14068 +        * If left waiter has a dl_prio(), and we didn't return 0 above,
14069 +        * then right waiter has a dl_prio() too.
14070 +        */
14071 +       if (dl_prio(left->prio))
14072 +               return left->deadline == right->deadline;
14073 +
14074 +       return 1;
14075 +}
14076 +
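A small worked example of the two steal modes defined above (illustrative only; kernel priority values, lower number means higher priority, neither waiter deadline-scheduled):

/*
 *   left->prio == 5, right->prio == 5:
 *     rt_mutex_waiter_less(left, right, STEAL_NORMAL)  -> 0  (equal prio may not steal)
 *     rt_mutex_waiter_less(left, right, STEAL_LATERAL) -> 1  (equal prio may steal)
 *
 *   left->prio == 4, right->prio == 5:
 *     either mode                                      -> 1  (strictly higher prio always wins)
 */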
14077  static void
14078  rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
14079  {
14080 @@ -253,7 +296,7 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
14081         while (*link) {
14082                 parent = *link;
14083                 entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry);
14084 -               if (rt_mutex_waiter_less(waiter, entry)) {
14085 +               if (rt_mutex_waiter_less(waiter, entry, STEAL_NORMAL)) {
14086                         link = &parent->rb_left;
14087                 } else {
14088                         link = &parent->rb_right;
14089 @@ -292,7 +335,7 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
14090         while (*link) {
14091                 parent = *link;
14092                 entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry);
14093 -               if (rt_mutex_waiter_less(waiter, entry)) {
14094 +               if (rt_mutex_waiter_less(waiter, entry, STEAL_NORMAL)) {
14095                         link = &parent->rb_left;
14096                 } else {
14097                         link = &parent->rb_right;
14098 @@ -320,72 +363,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
14099         RB_CLEAR_NODE(&waiter->pi_tree_entry);
14100  }
14101  
14102 -/*
14103 - * Calculate task priority from the waiter tree priority
14104 - *
14105 - * Return task->normal_prio when the waiter tree is empty or when
14106 - * the waiter is not allowed to do priority boosting
14107 - */
14108 -int rt_mutex_getprio(struct task_struct *task)
14109 +static void rt_mutex_adjust_prio(struct task_struct *p)
14110  {
14111 -       if (likely(!task_has_pi_waiters(task)))
14112 -               return task->normal_prio;
14113 +       struct task_struct *pi_task = NULL;
14114  
14115 -       return min(task_top_pi_waiter(task)->prio,
14116 -                  task->normal_prio);
14117 -}
14118 +       lockdep_assert_held(&p->pi_lock);
14119  
14120 -struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
14121 -{
14122 -       if (likely(!task_has_pi_waiters(task)))
14123 -               return NULL;
14124 +       if (task_has_pi_waiters(p))
14125 +               pi_task = task_top_pi_waiter(p)->task;
14126  
14127 -       return task_top_pi_waiter(task)->task;
14128 -}
14129 -
14130 -/*
14131 - * Called by sched_setscheduler() to get the priority which will be
14132 - * effective after the change.
14133 - */
14134 -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
14135 -{
14136 -       if (!task_has_pi_waiters(task))
14137 -               return newprio;
14138 -
14139 -       if (task_top_pi_waiter(task)->task->prio <= newprio)
14140 -               return task_top_pi_waiter(task)->task->prio;
14141 -       return newprio;
14142 -}
14143 -
14144 -/*
14145 - * Adjust the priority of a task, after its pi_waiters got modified.
14146 - *
14147 - * This can be both boosting and unboosting. task->pi_lock must be held.
14148 - */
14149 -static void __rt_mutex_adjust_prio(struct task_struct *task)
14150 -{
14151 -       int prio = rt_mutex_getprio(task);
14152 -
14153 -       if (task->prio != prio || dl_prio(prio))
14154 -               rt_mutex_setprio(task, prio);
14155 -}
14156 -
14157 -/*
14158 - * Adjust task priority (undo boosting). Called from the exit path of
14159 - * rt_mutex_slowunlock() and rt_mutex_slowlock().
14160 - *
14161 - * (Note: We do this outside of the protection of lock->wait_lock to
14162 - * allow the lock to be taken while or before we readjust the priority
14163 - * of task. We do not use the spin_xx_mutex() variants here as we are
14164 - * outside of the debug path.)
14165 - */
14166 -void rt_mutex_adjust_prio(struct task_struct *task)
14167 -{
14168 -       unsigned long flags;
14169 -
14170 -       raw_spin_lock_irqsave(&task->pi_lock, flags);
14171 -       __rt_mutex_adjust_prio(task);
14172 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14173 +       rt_mutex_setprio(p, pi_task);
14174  }
14175  
14176  /*
14177 @@ -414,6 +401,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
14178         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
14179  }
14180  
14181 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
14182 +{
14183 +       if (waiter->savestate)
14184 +               wake_up_lock_sleeper(waiter->task);
14185 +       else
14186 +               wake_up_process(waiter->task);
14187 +}
14188 +
14189  /*
14190   * Max number of times we'll walk the boosting chain:
14191   */
14192 @@ -421,7 +416,8 @@ int max_lock_depth = 1024;
14193  
14194  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
14195  {
14196 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
14197 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
14198 +               p->pi_blocked_on->lock : NULL;
14199  }
14200  
14201  /*
14202 @@ -557,7 +553,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14203          * reached or the state of the chain has changed while we
14204          * dropped the locks.
14205          */
14206 -       if (!waiter)
14207 +       if (!rt_mutex_real_waiter(waiter))
14208                 goto out_unlock_pi;
14209  
14210         /*
14211 @@ -608,7 +604,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14212          * enabled we continue, but stop the requeueing in the chain
14213          * walk.
14214          */
14215 -       if (waiter->prio == task->prio) {
14216 +       if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
14217                 if (!detect_deadlock)
14218                         goto out_unlock_pi;
14219                 else
14220 @@ -704,7 +700,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14221  
14222         /* [7] Requeue the waiter in the lock waiter tree. */
14223         rt_mutex_dequeue(lock, waiter);
14224 +
14225 +       /*
14226 +        * Update the waiter prio fields now that we're dequeued.
14227 +        *
14228 +        * These values can have changed through either:
14229 +        *
14230 +        *   sys_sched_set_scheduler() / sys_sched_setattr()
14231 +        *
14232 +        * or
14233 +        *
14234 +        *   DL CBS enforcement advancing the effective deadline.
14235 +        *
14236 +        * Even though pi_waiters also uses these fields, and that tree is only
14237 +        * updated in [11], we can do this here, since we hold [L], which
14238 +        * serializes all pi_waiters access and rb_erase() does not care about
14239 +        * the values of the node being removed.
14240 +        */
14241         waiter->prio = task->prio;
14242 +       waiter->deadline = task->dl.deadline;
14243 +
14244         rt_mutex_enqueue(lock, waiter);
14245  
14246         /* [8] Release the task */
14247 @@ -719,13 +734,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14248          * follow here. This is the end of the chain we are walking.
14249          */
14250         if (!rt_mutex_owner(lock)) {
14251 +               struct rt_mutex_waiter *lock_top_waiter;
14252 +
14253                 /*
14254                  * If the requeue [7] above changed the top waiter,
14255                  * then we need to wake the new top waiter up to try
14256                  * to get the lock.
14257                  */
14258 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
14259 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
14260 +               lock_top_waiter = rt_mutex_top_waiter(lock);
14261 +               if (prerequeue_top_waiter != lock_top_waiter)
14262 +                       rt_mutex_wake_waiter(lock_top_waiter);
14263                 raw_spin_unlock_irq(&lock->wait_lock);
14264                 return 0;
14265         }
14266 @@ -745,7 +763,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14267                  */
14268                 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
14269                 rt_mutex_enqueue_pi(task, waiter);
14270 -               __rt_mutex_adjust_prio(task);
14271 +               rt_mutex_adjust_prio(task);
14272  
14273         } else if (prerequeue_top_waiter == waiter) {
14274                 /*
14275 @@ -761,7 +779,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14276                 rt_mutex_dequeue_pi(task, waiter);
14277                 waiter = rt_mutex_top_waiter(lock);
14278                 rt_mutex_enqueue_pi(task, waiter);
14279 -               __rt_mutex_adjust_prio(task);
14280 +               rt_mutex_adjust_prio(task);
14281         } else {
14282                 /*
14283                  * Nothing changed. No need to do any priority
14284 @@ -818,6 +836,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14285         return ret;
14286  }
14287  
14288 +
14289  /*
14290   * Try to take an rt-mutex
14291   *
14292 @@ -828,9 +847,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14293   * @waiter: The waiter that is queued to the lock's wait tree if the
14294   *         callsite called task_blocked_on_lock(), otherwise NULL
14295   */
14296 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14297 -                               struct rt_mutex_waiter *waiter)
14298 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
14299 +                                 struct task_struct *task,
14300 +                                 struct rt_mutex_waiter *waiter, int mode)
14301  {
14302 +       lockdep_assert_held(&lock->wait_lock);
14303 +
14304         /*
14305          * Before testing whether we can acquire @lock, we set the
14306          * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
14307 @@ -866,8 +888,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14308                  * If waiter is not the highest priority waiter of
14309                  * @lock, give up.
14310                  */
14311 -               if (waiter != rt_mutex_top_waiter(lock))
14312 +               if (waiter != rt_mutex_top_waiter(lock)) {
14313 +                       /* XXX rt_mutex_waiter_less() ? */
14314                         return 0;
14315 +               }
14316  
14317                 /*
14318                  * We can acquire the lock. Remove the waiter from the
14319 @@ -885,14 +909,26 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14320                  * not need to be dequeued.
14321                  */
14322                 if (rt_mutex_has_waiters(lock)) {
14323 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
14324 +
14325 +                       if (task != pown)
14326 +                               return 0;
14327 +
14328 +                       /*
14329 +                        * Note that RT tasks are excluded from lateral-steals
14330 +                        * to prevent the introduction of an unbounded latency.
14331 +                        */
14332 +                       if (rt_task(task))
14333 +                               mode = STEAL_NORMAL;
14334                         /*
14335                          * If @task->prio is greater than or equal to
14336                          * the top waiter priority (kernel view),
14337                          * @task lost.
14338                          */
14339 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
14340 +                       if (!rt_mutex_waiter_less(task_to_waiter(task),
14341 +                                                 rt_mutex_top_waiter(lock),
14342 +                                                 mode))
14343                                 return 0;
14344 -
14345                         /*
14346                          * The current top waiter stays enqueued. We
14347                          * don't have to change anything in the lock
14348 @@ -936,11 +972,384 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14349          */
14350         rt_mutex_set_owner(lock, task);
14351  
14352 -       rt_mutex_deadlock_account_lock(lock, task);
14353 -
14354         return 1;
14355  }
14356  
14357 +#ifdef CONFIG_PREEMPT_RT_FULL
14358 +/*
14359 + * preemptible spin_lock functions:
14360 + */
14361 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
14362 +                                        void  (*slowfn)(struct rt_mutex *lock,
14363 +                                                        bool mg_off),
14364 +                                        bool do_mig_dis)
14365 +{
14366 +       might_sleep_no_state_check();
14367 +
14368 +       if (do_mig_dis)
14369 +               migrate_disable();
14370 +
14371 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14372 +               return;
14373 +       else
14374 +               slowfn(lock, do_mig_dis);
14375 +}
14376 +
14377 +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
14378 +                                          void  (*slowfn)(struct rt_mutex *lock))
14379 +{
14380 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
14381 +               return;
14382 +       else
14383 +               slowfn(lock);
14384 +}
14385 +#ifdef CONFIG_SMP
14386 +/*
14387 + * Note that owner is a speculative pointer and dereferencing relies
14388 + * on rcu_read_lock() and the check against the lock owner.
14389 + */
14390 +static int adaptive_wait(struct rt_mutex *lock,
14391 +                        struct task_struct *owner)
14392 +{
14393 +       int res = 0;
14394 +
14395 +       rcu_read_lock();
14396 +       for (;;) {
14397 +               if (owner != rt_mutex_owner(lock))
14398 +                       break;
14399 +               /*
14400 +                * Ensure that owner->on_cpu is dereferenced _after_
14401 +                * checking the above to be valid.
14402 +                */
14403 +               barrier();
14404 +               if (!owner->on_cpu) {
14405 +                       res = 1;
14406 +                       break;
14407 +               }
14408 +               cpu_relax();
14409 +       }
14410 +       rcu_read_unlock();
14411 +       return res;
14412 +}
14413 +#else
14414 +static int adaptive_wait(struct rt_mutex *lock,
14415 +                        struct task_struct *orig_owner)
14416 +{
14417 +       return 1;
14418 +}
14419 +#endif
14420 +
14421 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14422 +                                  struct rt_mutex_waiter *waiter,
14423 +                                  struct task_struct *task,
14424 +                                  enum rtmutex_chainwalk chwalk);
14425 +/*
14426 + * Slow path lock function spin_lock style: this variant is very
14427 + * careful not to miss any non-lock wakeups.
14428 + *
14429 + * We store the current state under p->pi_lock in p->saved_state and
14430 + * the try_to_wake_up() code handles this accordingly.
14431 + */
14432 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
14433 +                                                   bool mg_off)
14434 +{
14435 +       struct task_struct *lock_owner, *self = current;
14436 +       struct rt_mutex_waiter waiter, *top_waiter;
14437 +       unsigned long flags;
14438 +       int ret;
14439 +
14440 +       rt_mutex_init_waiter(&waiter, true);
14441 +
14442 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14443 +
14444 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
14445 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14446 +               return;
14447 +       }
14448 +
14449 +       BUG_ON(rt_mutex_owner(lock) == self);
14450 +
14451 +       /*
14452 +        * We save whatever state the task is in and we'll restore it
14453 +        * after acquiring the lock taking real wakeups into account
14454 +        * as well. We are serialized via pi_lock against wakeups. See
14455 +        * try_to_wake_up().
14456 +        */
14457 +       raw_spin_lock(&self->pi_lock);
14458 +       self->saved_state = self->state;
14459 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14460 +       raw_spin_unlock(&self->pi_lock);
14461 +
14462 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
14463 +       BUG_ON(ret);
14464 +
14465 +       for (;;) {
14466 +               /* Try to acquire the lock again. */
14467 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
14468 +                       break;
14469 +
14470 +               top_waiter = rt_mutex_top_waiter(lock);
14471 +               lock_owner = rt_mutex_owner(lock);
14472 +
14473 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14474 +
14475 +               debug_rt_mutex_print_deadlock(&waiter);
14476 +
14477 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
14478 +                       if (mg_off)
14479 +                               migrate_enable();
14480 +                       schedule();
14481 +                       if (mg_off)
14482 +                               migrate_disable();
14483 +               }
14484 +
14485 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
14486 +
14487 +               raw_spin_lock(&self->pi_lock);
14488 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14489 +               raw_spin_unlock(&self->pi_lock);
14490 +       }
14491 +
14492 +       /*
14493 +        * Restore the task state to current->saved_state. We set it
14494 +        * to the original state above and the try_to_wake_up() code
14495 +        * has possibly updated it when a real (non-rtmutex) wakeup
14496 +        * happened while we were blocked. Clear saved_state so
14497 +        * try_to_wakeup() does not get confused.
14498 +        * try_to_wake_up() does not get confused.
14499 +       raw_spin_lock(&self->pi_lock);
14500 +       __set_current_state_no_track(self->saved_state);
14501 +       self->saved_state = TASK_RUNNING;
14502 +       raw_spin_unlock(&self->pi_lock);
14503 +
14504 +       /*
14505 +        * try_to_take_rt_mutex() sets the waiter bit
14506 +        * unconditionally. We might have to fix that up:
14507 +        */
14508 +       fixup_rt_mutex_waiters(lock);
14509 +
14510 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
14511 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
14512 +
14513 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14514 +
14515 +       debug_rt_mutex_free_waiter(&waiter);
14516 +}
14517 +
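One way to read the saved_state handling above, as an assumed sequence (not part of the patch): a task that was about to sleep interruptibly must not lose a signal wakeup that arrives while it is blocked on the sleeping spinlock.

/*
 *   T (state == TASK_INTERRUPTIBLE) calls spin_lock() and has to block:
 *       saved_state = TASK_INTERRUPTIBLE, state = TASK_UNINTERRUPTIBLE
 *   a signal wakeup arrives: try_to_wake_up() recognizes the lock sleep
 *       and records the wakeup in saved_state instead of state
 *   T finally acquires the lock:
 *       state is restored from saved_state, saved_state = TASK_RUNNING
 *   T returns from spin_lock() runnable and still sees the pending signal.
 */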
14518 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
14519 +                                            struct wake_q_head *wake_q,
14520 +                                            struct wake_q_head *wq_sleeper);
14521 +/*
14522 + * Slow path to release a rt_mutex spin_lock style
14523 + */
14524 +static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
14525 +{
14526 +       unsigned long flags;
14527 +       WAKE_Q(wake_q);
14528 +       WAKE_Q(wake_sleeper_q);
14529 +       bool postunlock;
14530 +
14531 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14532 +       postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q);
14533 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14534 +
14535 +       if (postunlock)
14536 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
14537 +}
14538 +
14539 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
14540 +{
14541 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
14542 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14543 +}
14544 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
14545 +
14546 +void __lockfunc rt_spin_lock(spinlock_t *lock)
14547 +{
14548 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
14549 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14550 +}
14551 +EXPORT_SYMBOL(rt_spin_lock);
14552 +
14553 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
14554 +{
14555 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
14556 +}
14557 +EXPORT_SYMBOL(__rt_spin_lock);
14558 +
14559 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
14560 +{
14561 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
14562 +}
14563 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
14564 +
14565 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14566 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
14567 +{
14568 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
14569 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
14570 +}
14571 +EXPORT_SYMBOL(rt_spin_lock_nested);
14572 +#endif
14573 +
14574 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
14575 +{
14576 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14577 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14578 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
14579 +}
14580 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
14581 +
14582 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
14583 +{
14584 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14585 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14586 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
14587 +       migrate_enable();
14588 +}
14589 +EXPORT_SYMBOL(rt_spin_unlock);
14590 +
14591 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
14592 +{
14593 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
14594 +}
14595 +EXPORT_SYMBOL(__rt_spin_unlock);
14596 +
14597 +/*
14598 + * Wait for the lock to get unlocked: instead of polling for an unlock
14599 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
14600 + * schedule if there's contention:
14601 + */
14602 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
14603 +{
14604 +       spin_lock(lock);
14605 +       spin_unlock(lock);
14606 +}
14607 +EXPORT_SYMBOL(rt_spin_unlock_wait);
14608 +
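For context, a historical caller pattern for spin_unlock_wait() looks roughly like the hypothetical sketch below; on RT the wait above sleeps on contention instead of spinning, and real users still need the usual memory-ordering care around the flag:

static void foo_close(struct foo *f)
{
        f->closing = true;              /* new lock takers bail out on this flag */
        spin_unlock_wait(&f->lock);     /* let a holder already inside finish    */
        kfree(f);
}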
14609 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
14610 +{
14611 +       int ret;
14612 +
14613 +       ret = rt_mutex_trylock(&lock->lock);
14614 +       if (ret)
14615 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14616 +       return ret;
14617 +}
14618 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
14619 +
14620 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
14621 +{
14622 +       int ret;
14623 +
14624 +       migrate_disable();
14625 +       ret = rt_mutex_trylock(&lock->lock);
14626 +       if (ret)
14627 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14628 +       else
14629 +               migrate_enable();
14630 +       return ret;
14631 +}
14632 +EXPORT_SYMBOL(rt_spin_trylock);
14633 +
14634 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
14635 +{
14636 +       int ret;
14637 +
14638 +       local_bh_disable();
14639 +       ret = rt_mutex_trylock(&lock->lock);
14640 +       if (ret) {
14641 +               migrate_disable();
14642 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14643 +       } else
14644 +               local_bh_enable();
14645 +       return ret;
14646 +}
14647 +EXPORT_SYMBOL(rt_spin_trylock_bh);
14648 +
14649 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
14650 +{
14651 +       int ret;
14652 +
14653 +       *flags = 0;
14654 +       ret = rt_mutex_trylock(&lock->lock);
14655 +       if (ret) {
14656 +               migrate_disable();
14657 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14658 +       }
14659 +       return ret;
14660 +}
14661 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
14662 +
14663 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
14664 +{
14665 +       /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
14666 +       if (atomic_add_unless(atomic, -1, 1))
14667 +               return 0;
14668 +       rt_spin_lock(lock);
14669 +       if (atomic_dec_and_test(atomic))
14670 +               return 1;
14671 +       rt_spin_unlock(lock);
14672 +       return 0;
14673 +}
14674 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
14675 +
14676 +void
14677 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
14678 +{
14679 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14680 +       /*
14681 +        * Make sure we are not reinitializing a held lock:
14682 +        */
14683 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
14684 +       lockdep_init_map(&lock->dep_map, name, key, 0);
14685 +#endif
14686 +}
14687 +EXPORT_SYMBOL(__rt_spin_lock_init);
14688 +
14689 +#endif /* PREEMPT_RT_FULL */
14690 +
14691 +#ifdef CONFIG_PREEMPT_RT_FULL
14692 +static inline int __sched
14693 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
14694 +{
14695 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14696 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
14697 +
14698 +       if (!hold_ctx)
14699 +               return 0;
14700 +
14701 +       if (unlikely(ctx == hold_ctx))
14702 +               return -EALREADY;
14703 +
14704 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
14705 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
14706 +#ifdef CONFIG_DEBUG_MUTEXES
14707 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
14708 +               ctx->contending_lock = ww;
14709 +#endif
14710 +               return -EDEADLK;
14711 +       }
14712 +
14713 +       return 0;
14714 +}
14715 +#else
14716 +static inline int __sched
14717 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
14718 +{
14719 +       BUG();
14720 +       return 0;
14721 +}
14722 +
14723 +#endif
14724 +
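The stamp comparison above is what hands -EDEADLK to the younger ww_acquire context; a simplified, illustrative sketch of the caller-side backoff it expects (lock_two() is hypothetical, error unwinding trimmed):

static int lock_two(struct ww_mutex *a, struct ww_mutex *b,
                    struct ww_acquire_ctx *ctx)
{
        int ret;

        ret = ww_mutex_lock(a, ctx);
        if (ret)
                return ret;

        ret = ww_mutex_lock(b, ctx);
        if (ret == -EDEADLK) {
                /* Younger context: back off, then sleep until @b is free. */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, ctx);
                ret = ww_mutex_lock(a, ctx);
                if (ret)
                        ww_mutex_unlock(b);
        }
        return ret;
}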
14725 +static inline int
14726 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14727 +                    struct rt_mutex_waiter *waiter)
14728 +{
14729 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
14730 +}
14731 +
14732  /*
14733   * Task blocks on lock.
14734   *
14735 @@ -958,6 +1367,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14736         struct rt_mutex *next_lock;
14737         int chain_walk = 0, res;
14738  
14739 +       lockdep_assert_held(&lock->wait_lock);
14740 +
14741         /*
14742          * Early deadlock detection. We really don't want the task to
14743          * enqueue on itself just to untangle the mess later. It's not
14744 @@ -971,10 +1382,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14745                 return -EDEADLK;
14746  
14747         raw_spin_lock(&task->pi_lock);
14748 -       __rt_mutex_adjust_prio(task);
14749 +
14750 +       /*
14751 +        * In the case of futex requeue PI, this will be a proxy
14752 +        * lock. The task will wake unaware that it is enqueued on
14753 +        * this lock. Avoid blocking on two locks and corrupting
14754 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
14755 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
14756 +        * before requeue (due to a signal or timeout). Do not enqueue
14757 +        * the task if PI_WAKEUP_INPROGRESS is set.
14758 +        */
14759 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
14760 +               raw_spin_unlock(&task->pi_lock);
14761 +               return -EAGAIN;
14762 +       }
14763 +
14764 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
14765 +
14766 +       rt_mutex_adjust_prio(task);
14767         waiter->task = task;
14768         waiter->lock = lock;
14769         waiter->prio = task->prio;
14770 +       waiter->deadline = task->dl.deadline;
14771  
14772         /* Get the top priority waiter on the lock */
14773         if (rt_mutex_has_waiters(lock))
14774 @@ -993,8 +1422,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14775                 rt_mutex_dequeue_pi(owner, top_waiter);
14776                 rt_mutex_enqueue_pi(owner, waiter);
14777  
14778 -               __rt_mutex_adjust_prio(owner);
14779 -               if (owner->pi_blocked_on)
14780 +               rt_mutex_adjust_prio(owner);
14781 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
14782                         chain_walk = 1;
14783         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
14784                 chain_walk = 1;
14785 @@ -1036,6 +1465,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14786   * Called with lock->wait_lock held and interrupts disabled.
14787   */
14788  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14789 +                                   struct wake_q_head *wake_sleeper_q,
14790                                     struct rt_mutex *lock)
14791  {
14792         struct rt_mutex_waiter *waiter;
14793 @@ -1045,12 +1475,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14794         waiter = rt_mutex_top_waiter(lock);
14795  
14796         /*
14797 -        * Remove it from current->pi_waiters. We do not adjust a
14798 -        * possible priority boost right now. We execute wakeup in the
14799 -        * boosted mode and go back to normal after releasing
14800 -        * lock->wait_lock.
14801 +        * Remove it from current->pi_waiters and deboost.
14802 +        *
14803 +        * We must in fact deboost here in order to ensure we call
14804 +        * rt_mutex_setprio() to update p->pi_top_task before the
14805 +        * task unblocks.
14806          */
14807         rt_mutex_dequeue_pi(current, waiter);
14808 +       rt_mutex_adjust_prio(current);
14809  
14810         /*
14811          * As we are waking up the top waiter, and the waiter stays
14812 @@ -1062,9 +1494,22 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14813          */
14814         lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
14815  
14816 +       /*
14817 +        * We deboosted before waking the top waiter task such that we don't
14818 +        * run two tasks with the 'same' priority (and ensure the
14819 +        * p->pi_top_task pointer points to a blocked task). This however can
14820 +        * lead to priority inversion if we would get preempted after the
14821 +        * deboost but before waking our donor task, hence the preempt_disable()
14822 +        * before unlock.
14823 +        *
14824 +        * Pairs with preempt_enable() in rt_mutex_postunlock();
14825 +        */
14826 +       preempt_disable();
14827 +       if (waiter->savestate)
14828 +               wake_q_add(wake_sleeper_q, waiter->task);
14829 +       else
14830 +               wake_q_add(wake_q, waiter->task);
14831         raw_spin_unlock(&current->pi_lock);
14832 -
14833 -       wake_q_add(wake_q, waiter->task);
14834  }
14835  
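An illustrative timeline of the deboost/preempt_disable() pairing described in the comments above, assuming the owner O had been boosted by the single waiter W:

/*
 *   O: rt_mutex_dequeue_pi(O, W)    W leaves O's pi_waiters tree
 *   O: rt_mutex_adjust_prio(O)      O drops back to its normal priority
 *   O: preempt_disable()            O keeps the CPU although it is now
 *                                   deboosted and W may be higher prio
 *   O: wake_q_add(..., W)           W is queued for wakeup
 *   O: drop pi_lock and wait_lock
 *   O: rt_mutex_postunlock()        wake_up_q() actually wakes W, then
 *                                   preempt_enable() pairs with the above
 */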
14836  /*
14837 @@ -1078,7 +1523,9 @@ static void remove_waiter(struct rt_mutex *lock,
14838  {
14839         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
14840         struct task_struct *owner = rt_mutex_owner(lock);
14841 -       struct rt_mutex *next_lock;
14842 +       struct rt_mutex *next_lock = NULL;
14843 +
14844 +       lockdep_assert_held(&lock->wait_lock);
14845  
14846         raw_spin_lock(&current->pi_lock);
14847         rt_mutex_dequeue(lock, waiter);
14848 @@ -1099,10 +1546,11 @@ static void remove_waiter(struct rt_mutex *lock,
14849         if (rt_mutex_has_waiters(lock))
14850                 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
14851  
14852 -       __rt_mutex_adjust_prio(owner);
14853 +       rt_mutex_adjust_prio(owner);
14854  
14855         /* Store the lock on which owner is blocked or NULL */
14856 -       next_lock = task_blocked_on_lock(owner);
14857 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
14858 +               next_lock = task_blocked_on_lock(owner);
14859  
14860         raw_spin_unlock(&owner->pi_lock);
14861  
14862 @@ -1138,21 +1586,30 @@ void rt_mutex_adjust_pi(struct task_struct *task)
14863         raw_spin_lock_irqsave(&task->pi_lock, flags);
14864  
14865         waiter = task->pi_blocked_on;
14866 -       if (!waiter || (waiter->prio == task->prio &&
14867 -                       !dl_prio(task->prio))) {
14868 +       if (!rt_mutex_real_waiter(waiter) ||
14869 +           rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
14870                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14871                 return;
14872         }
14873         next_lock = waiter->lock;
14874 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14875  
14876         /* gets dropped in rt_mutex_adjust_prio_chain()! */
14877         get_task_struct(task);
14878  
14879 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14880         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
14881                                    next_lock, NULL, task);
14882  }
14883  
14884 +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
14885 +{
14886 +       debug_rt_mutex_init_waiter(waiter);
14887 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
14888 +       RB_CLEAR_NODE(&waiter->tree_entry);
14889 +       waiter->task = NULL;
14890 +       waiter->savestate = savestate;
14891 +}
14892 +
14893  /**
14894   * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
14895   * @lock:               the rt_mutex to take
14896 @@ -1166,7 +1623,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
14897  static int __sched
14898  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14899                     struct hrtimer_sleeper *timeout,
14900 -                   struct rt_mutex_waiter *waiter)
14901 +                   struct rt_mutex_waiter *waiter,
14902 +                   struct ww_acquire_ctx *ww_ctx)
14903  {
14904         int ret = 0;
14905  
14906 @@ -1175,16 +1633,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14907                 if (try_to_take_rt_mutex(lock, current, waiter))
14908                         break;
14909  
14910 -               /*
14911 -                * TASK_INTERRUPTIBLE checks for signals and
14912 -                * timeout. Ignored otherwise.
14913 -                */
14914 -               if (unlikely(state == TASK_INTERRUPTIBLE)) {
14915 -                       /* Signal pending? */
14916 -                       if (signal_pending(current))
14917 -                               ret = -EINTR;
14918 -                       if (timeout && !timeout->task)
14919 -                               ret = -ETIMEDOUT;
14920 +               if (timeout && !timeout->task) {
14921 +                       ret = -ETIMEDOUT;
14922 +                       break;
14923 +               }
14924 +               if (signal_pending_state(state, current)) {
14925 +                       ret = -EINTR;
14926 +                       break;
14927 +               }
14928 +
14929 +               if (ww_ctx && ww_ctx->acquired > 0) {
14930 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
14931                         if (ret)
14932                                 break;
14933                 }
14934 @@ -1223,21 +1682,148 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
14935         }
14936  }
14937  
14938 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
14939 +                                                  struct ww_acquire_ctx *ww_ctx)
14940 +{
14941 +#ifdef CONFIG_DEBUG_MUTEXES
14942 +       /*
14943 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
14944 +        * but released with a normal mutex_unlock in this call.
14945 +        *
14946 +        * This should never happen, always use ww_mutex_unlock.
14947 +        */
14948 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
14949 +
14950 +       /*
14951 +        * Not quite done after calling ww_acquire_done() ?
14952 +        */
14953 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
14954 +
14955 +       if (ww_ctx->contending_lock) {
14956 +               /*
14957 +                * After -EDEADLK you tried to
14958 +                * acquire a different ww_mutex? Bad!
14959 +                */
14960 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
14961 +
14962 +               /*
14963 +                * You called ww_mutex_lock after receiving -EDEADLK,
14964 +                * but 'forgot' to unlock everything else first?
14965 +                */
14966 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
14967 +               ww_ctx->contending_lock = NULL;
14968 +       }
14969 +
14970 +       /*
14971 +        * Naughty, using a different class will lead to undefined behavior!
14972 +        */
14973 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
14974 +#endif
14975 +       ww_ctx->acquired++;
14976 +}
14977 +
14978 +#ifdef CONFIG_PREEMPT_RT_FULL
14979 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14980 +                                 struct ww_acquire_ctx *ww_ctx)
14981 +{
14982 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14983 +       struct rt_mutex_waiter *waiter, *n;
14984 +
14985 +       /*
14986 +        * This branch gets optimized out for the common case,
14987 +        * and is only important for ww_mutex_lock.
14988 +        */
14989 +       ww_mutex_lock_acquired(ww, ww_ctx);
14990 +       ww->ctx = ww_ctx;
14991 +
14992 +       /*
14993 +        * Give any possible sleeping processes the chance to wake up,
14994 +        * so they can recheck if they have to back off.
14995 +        */
14996 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
14997 +                                            tree_entry) {
14998 +               /* XXX debug rt mutex waiter wakeup */
14999 +
15000 +               BUG_ON(waiter->lock != lock);
15001 +               rt_mutex_wake_waiter(waiter);
15002 +       }
15003 +}
15004 +
15005 +#else
15006 +
15007 +static void ww_mutex_account_lock(struct rt_mutex *lock,
15008 +                                 struct ww_acquire_ctx *ww_ctx)
15009 +{
15010 +       BUG();
15011 +}
15012 +#endif
15013 +
15014 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
15015 +                                    struct hrtimer_sleeper *timeout,
15016 +                                    enum rtmutex_chainwalk chwalk,
15017 +                                    struct ww_acquire_ctx *ww_ctx,
15018 +                                    struct rt_mutex_waiter *waiter)
15019 +{
15020 +       int ret;
15021 +
15022 +       /* Try to acquire the lock again: */
15023 +       if (try_to_take_rt_mutex(lock, current, NULL)) {
15024 +               if (ww_ctx)
15025 +                       ww_mutex_account_lock(lock, ww_ctx);
15026 +               return 0;
15027 +       }
15028 +
15029 +       set_current_state(state);
15030 +
15031 +       /* Setup the timer, when timeout != NULL */
15032 +       if (unlikely(timeout))
15033 +               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
15034 +
15035 +       ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk);
15036 +
15037 +       if (likely(!ret)) {
15038 +               /* sleep on the mutex */
15039 +               ret = __rt_mutex_slowlock(lock, state, timeout, waiter,
15040 +                                         ww_ctx);
15041 +       } else if (ww_ctx) {
15042 +               /* ww_mutex received EDEADLK, let it become EALREADY */
15043 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
15044 +               BUG_ON(!ret);
15045 +       }
15046 +
15047 +       if (unlikely(ret)) {
15048 +               __set_current_state(TASK_RUNNING);
15049 +               if (rt_mutex_has_waiters(lock))
15050 +                       remove_waiter(lock, waiter);
15051 +               /* ww_mutex wants to report EDEADLK/EALREADY, let it */
15052 +               if (!ww_ctx)
15053 +                       rt_mutex_handle_deadlock(ret, chwalk, waiter);
15054 +       } else if (ww_ctx) {
15055 +               ww_mutex_account_lock(lock, ww_ctx);
15056 +       }
15057 +
15058 +       /*
15059 +        * try_to_take_rt_mutex() sets the waiter bit
15060 +        * unconditionally. We might have to fix that up.
15061 +        */
15062 +       fixup_rt_mutex_waiters(lock);
15063 +       return ret;
15064 +}
15065 +
15066  /*
15067   * Slow path lock function:
15068   */
15069  static int __sched
15070  rt_mutex_slowlock(struct rt_mutex *lock, int state,
15071                   struct hrtimer_sleeper *timeout,
15072 -                 enum rtmutex_chainwalk chwalk)
15073 +                 enum rtmutex_chainwalk chwalk,
15074 +                 struct ww_acquire_ctx *ww_ctx)
15075  {
15076         struct rt_mutex_waiter waiter;
15077         unsigned long flags;
15078         int ret = 0;
15079  
15080 -       debug_rt_mutex_init_waiter(&waiter);
15081 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
15082 -       RB_CLEAR_NODE(&waiter.tree_entry);
15083 +       rt_mutex_init_waiter(&waiter, false);
15084  
15085         /*
15086          * Technically we could use raw_spin_[un]lock_irq() here, but this can
15087 @@ -1249,36 +1835,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
15088          */
15089         raw_spin_lock_irqsave(&lock->wait_lock, flags);
15090  
15091 -       /* Try to acquire the lock again: */
15092 -       if (try_to_take_rt_mutex(lock, current, NULL)) {
15093 -               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15094 -               return 0;
15095 -       }
15096 -
15097 -       set_current_state(state);
15098 -
15099 -       /* Setup the timer, when timeout != NULL */
15100 -       if (unlikely(timeout))
15101 -               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
15102 -
15103 -       ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
15104 -
15105 -       if (likely(!ret))
15106 -               /* sleep on the mutex */
15107 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
15108 -
15109 -       if (unlikely(ret)) {
15110 -               __set_current_state(TASK_RUNNING);
15111 -               if (rt_mutex_has_waiters(lock))
15112 -                       remove_waiter(lock, &waiter);
15113 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
15114 -       }
15115 -
15116 -       /*
15117 -        * try_to_take_rt_mutex() sets the waiter bit
15118 -        * unconditionally. We might have to fix that up.
15119 -        */
15120 -       fixup_rt_mutex_waiters(lock);
15121 +       ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx,
15122 +                                      &waiter);
15123  
15124         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15125  
15126 @@ -1328,10 +1886,12 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
15127  
15128  /*
15129   * Slow path to release a rt-mutex.
15130 - * Return whether the current task needs to undo a potential priority boosting.
15131 + *
15132 + * Return whether the current task needs to call rt_mutex_postunlock().
15133   */
15134  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15135 -                                       struct wake_q_head *wake_q)
15136 +                                       struct wake_q_head *wake_q,
15137 +                                       struct wake_q_head *wake_sleeper_q)
15138  {
15139         unsigned long flags;
15140  
15141 @@ -1340,8 +1900,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15142  
15143         debug_rt_mutex_unlock(lock);
15144  
15145 -       rt_mutex_deadlock_account_unlock(current);
15146 -
15147         /*
15148          * We must be careful here if the fast path is enabled. If we
15149          * have no waiters queued we cannot set owner to NULL here
15150 @@ -1387,12 +1945,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15151          *
15152          * Queue the next waiter for wakeup once we release the wait_lock.
15153          */
15154 -       mark_wakeup_next_waiter(wake_q, lock);
15155 -
15156 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
15157         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
15158  
15159 -       /* check PI boosting */
15160 -       return true;
15161 +       return true; /* call rt_mutex_postunlock() */
15162  }
15163  
15164  /*
15165 @@ -1403,63 +1959,85 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
15166   */
15167  static inline int
15168  rt_mutex_fastlock(struct rt_mutex *lock, int state,
15169 +                 struct ww_acquire_ctx *ww_ctx,
15170                   int (*slowfn)(struct rt_mutex *lock, int state,
15171                                 struct hrtimer_sleeper *timeout,
15172 -                               enum rtmutex_chainwalk chwalk))
15173 +                               enum rtmutex_chainwalk chwalk,
15174 +                               struct ww_acquire_ctx *ww_ctx))
15175  {
15176 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15177 -               rt_mutex_deadlock_account_lock(lock, current);
15178 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15179                 return 0;
15180 -       } else
15181 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
15182 +
15183 +       return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
15184  }
15185  
15186  static inline int
15187  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
15188                         struct hrtimer_sleeper *timeout,
15189                         enum rtmutex_chainwalk chwalk,
15190 +                       struct ww_acquire_ctx *ww_ctx,
15191                         int (*slowfn)(struct rt_mutex *lock, int state,
15192                                       struct hrtimer_sleeper *timeout,
15193 -                                     enum rtmutex_chainwalk chwalk))
15194 +                                     enum rtmutex_chainwalk chwalk,
15195 +                                     struct ww_acquire_ctx *ww_ctx))
15196  {
15197         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
15198 -           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15199 -               rt_mutex_deadlock_account_lock(lock, current);
15200 +           likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15201                 return 0;
15202 -       } else
15203 -               return slowfn(lock, state, timeout, chwalk);
15204 +
15205 +       return slowfn(lock, state, timeout, chwalk, ww_ctx);
15206  }
15207  
15208  static inline int
15209  rt_mutex_fasttrylock(struct rt_mutex *lock,
15210                      int (*slowfn)(struct rt_mutex *lock))
15211  {
15212 -       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
15213 -               rt_mutex_deadlock_account_lock(lock, current);
15214 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
15215                 return 1;
15216 -       }
15217 +
15218         return slowfn(lock);
15219  }
15220  
15221 +/*
15222 + * Performs the wakeup of the top-waiter and re-enables preemption.
15223 + */
15224 +void rt_mutex_postunlock(struct wake_q_head *wake_q,
15225 +                        struct wake_q_head *wq_sleeper)
15226 +{
15227 +       wake_up_q(wake_q);
15228 +       wake_up_q_sleeper(wq_sleeper);
15229 +
15230 +       /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
15231 +       preempt_enable();
15232 +}
15233 +
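The helper above is one half of the kernel's deferred-wakeup pattern: wakeups are queued while wait_lock is held and only performed after the lock has been dropped (and, on RT, after the separate sleeper queue has been flushed). As a rough illustration only (the lock type and field names below are hypothetical, not part of the patch), the generic shape of that pattern is:

        #include <linux/sched.h>
        #include <linux/spinlock.h>

        /* Hypothetical lock type, purely to illustrate the wake_q pattern. */
        struct foo_lock {
                raw_spinlock_t          wait_lock;
                struct task_struct      *top_waiter;
        };

        static void foo_unlock_slow(struct foo_lock *f)
        {
                WAKE_Q(wake_q);         /* on-stack wake queue, as used throughout this patch */
                unsigned long flags;

                raw_spin_lock_irqsave(&f->wait_lock, flags);
                if (f->top_waiter)
                        wake_q_add(&wake_q, f->top_waiter);     /* queue, don't wake yet */
                raw_spin_unlock_irqrestore(&f->wait_lock, flags);

                wake_up_q(&wake_q);     /* perform the wakeup outside of wait_lock */
        }

rt_mutex_postunlock() differs from this plain pattern in that preemption stays disabled from mark_wakeup_next_waiter() until after the wakeups, to avoid an inversion before the new owner has been woken.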
15234  static inline void
15235  rt_mutex_fastunlock(struct rt_mutex *lock,
15236                     bool (*slowfn)(struct rt_mutex *lock,
15237 -                                  struct wake_q_head *wqh))
15238 +                                  struct wake_q_head *wqh,
15239 +                                  struct wake_q_head *wq_sleeper))
15240  {
15241         WAKE_Q(wake_q);
15242 +       WAKE_Q(wake_sleeper_q);
15243  
15244 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15245 -               rt_mutex_deadlock_account_unlock(current);
15246 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
15247 +               return;
15248  
15249 -       } else {
15250 -               bool deboost = slowfn(lock, &wake_q);
15251 +       if (slowfn(lock, &wake_q,  &wake_sleeper_q))
15252 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15253 +}
15254  
15255 -               wake_up_q(&wake_q);
15256 +/**
15257 + * rt_mutex_lock_state - lock a rt_mutex with a given state
15258 + *
15259 + * @lock:      The rt_mutex to be locked
15260 + * @state:     The state to set when blocking on the rt_mutex
15261 + */
15262 +int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state)
15263 +{
15264 +       might_sleep();
15265  
15266 -               /* Undo pi boosting if necessary: */
15267 -               if (deboost)
15268 -                       rt_mutex_adjust_prio(current);
15269 -       }
15270 +       return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock);
15271  }
15272  
15273  /**
15274 @@ -1469,15 +2047,13 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
15275   */
15276  void __sched rt_mutex_lock(struct rt_mutex *lock)
15277  {
15278 -       might_sleep();
15279 -
15280 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
15281 +       rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE);
15282  }
15283  EXPORT_SYMBOL_GPL(rt_mutex_lock);
15284  
15285  /**
15286   * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
15287 - *
15288 + **
15289   * @lock:              the rt_mutex to be locked
15290   *
15291   * Returns:
15292 @@ -1486,23 +2062,32 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
15293   */
15294  int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
15295  {
15296 -       might_sleep();
15297 -
15298 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
15299 +       return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE);
15300  }
15301  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
15302  
15303 -/*
15304 - * Futex variant with full deadlock detection.
15305 +/**
15306 + * rt_mutex_lock_killable - lock a rt_mutex killable
15307 + *
15308 + * @lock:              the rt_mutex to be locked
15310 + *
15311 + * Returns:
15312 + *  0          on success
15313 + * -EINTR      when interrupted by a signal
15314   */
15315 -int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
15316 -                             struct hrtimer_sleeper *timeout)
15317 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
15318  {
15319 -       might_sleep();
15320 +       return rt_mutex_lock_state(lock, TASK_KILLABLE);
15321 +}
15322 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
15323  
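For illustration, a caller of the new killable variant simply propagates the -EINTR it may return. A minimal, hypothetical sketch, assuming rt_mutex_lock_killable() is declared where the caller can see it:

        /* Hypothetical: do some work under 'mylock', but give up on a fatal signal. */
        static int demo_protected_work(struct rt_mutex *mylock)
        {
                int ret;

                ret = rt_mutex_lock_killable(mylock);
                if (ret)                /* -EINTR: a fatal signal arrived while blocked */
                        return ret;

                /* ... critical section ... */

                rt_mutex_unlock(mylock);
                return 0;
        }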
15324 -       return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15325 -                                      RT_MUTEX_FULL_CHAINWALK,
15326 -                                      rt_mutex_slowlock);
15327 +/*
15328 + * Futex variant, must not use fastpath.
15329 + */
15330 +int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
15331 +{
15332 +       return rt_mutex_slowtrylock(lock);
15333  }
15334  
15335  /**
15336 @@ -1525,6 +2110,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
15337  
15338         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15339                                        RT_MUTEX_MIN_CHAINWALK,
15340 +                                      NULL,
15341                                        rt_mutex_slowlock);
15342  }
15343  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15344 @@ -1542,7 +2128,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15345   */
15346  int __sched rt_mutex_trylock(struct rt_mutex *lock)
15347  {
15348 +#ifdef CONFIG_PREEMPT_RT_FULL
15349 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
15350 +#else
15351         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
15352 +#endif
15353                 return 0;
15354  
15355         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
15356 @@ -1560,21 +2150,53 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
15357  }
15358  EXPORT_SYMBOL_GPL(rt_mutex_unlock);
15359  
15360 -/**
15361 - * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
15362 - * @lock: the rt_mutex to be unlocked
15363 - *
15364 - * Returns: true/false indicating whether priority adjustment is
15365 - * required or not.
15366 - */
15367 -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
15368 -                                  struct wake_q_head *wqh)
15369 +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock,
15370 +                                            struct wake_q_head *wake_q,
15371 +                                            struct wake_q_head *wq_sleeper)
15372  {
15373 -       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15374 -               rt_mutex_deadlock_account_unlock(current);
15375 -               return false;
15376 +       lockdep_assert_held(&lock->wait_lock);
15377 +
15378 +       debug_rt_mutex_unlock(lock);
15379 +
15380 +       if (!rt_mutex_has_waiters(lock)) {
15381 +               lock->owner = NULL;
15382 +               return false; /* done */
15383         }
15384 -       return rt_mutex_slowunlock(lock, wqh);
15385 +
15386 +       /*
15387 +        * We've already deboosted, mark_wakeup_next_waiter() will
15388 +        * retain preempt_disabled when we drop the wait_lock, to
15389 +        * avoid inversion prior to the wakeup.  preempt_disable()
15390 +        * therein pairs with rt_mutex_postunlock().
15391 +        */
15392 +       mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
15393 +
15394 +       return true; /* call postunlock() */
15395 +}
15396 +
15397 +/**
15398 + * Futex variant: since the futex variants do not use the fast-path, this can be
15399 + * simple and does not need to retry.
15400 + */
15401 +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
15402 +                                   struct wake_q_head *wake_q,
15403 +                                   struct wake_q_head *wq_sleeper)
15404 +{
15405 +       return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper);
15406 +}
15407 +
15408 +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
15409 +{
15410 +       WAKE_Q(wake_q);
15411 +       WAKE_Q(wake_sleeper_q);
15412 +       bool postunlock;
15413 +
15414 +       raw_spin_lock_irq(&lock->wait_lock);
15415 +       postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
15416 +       raw_spin_unlock_irq(&lock->wait_lock);
15417 +
15418 +       if (postunlock)
15419 +               rt_mutex_postunlock(&wake_q, &wake_sleeper_q);
15420  }
15421  
15422  /**
15423 @@ -1607,13 +2229,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
15424  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
15425  {
15426         lock->owner = NULL;
15427 -       raw_spin_lock_init(&lock->wait_lock);
15428         lock->waiters = RB_ROOT;
15429         lock->waiters_leftmost = NULL;
15430  
15431         debug_rt_mutex_init(lock, name);
15432  }
15433 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
15434 +EXPORT_SYMBOL(__rt_mutex_init);
15435  
15436  /**
15437   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
15438 @@ -1628,10 +2249,9 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
15439  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
15440                                 struct task_struct *proxy_owner)
15441  {
15442 -       __rt_mutex_init(lock, NULL);
15443 +       rt_mutex_init(lock);
15444         debug_rt_mutex_proxy_lock(lock, proxy_owner);
15445         rt_mutex_set_owner(lock, proxy_owner);
15446 -       rt_mutex_deadlock_account_lock(lock, proxy_owner);
15447  }
15448  
15449  /**
15450 @@ -1647,7 +2267,66 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
15451  {
15452         debug_rt_mutex_proxy_unlock(lock);
15453         rt_mutex_set_owner(lock, NULL);
15454 -       rt_mutex_deadlock_account_unlock(proxy_owner);
15455 +}
15456 +
15457 +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15458 +                             struct rt_mutex_waiter *waiter,
15459 +                             struct task_struct *task)
15460 +{
15461 +       int ret;
15462 +
15463 +       if (try_to_take_rt_mutex(lock, task, NULL))
15464 +               return 1;
15465 +
15466 +#ifdef CONFIG_PREEMPT_RT_FULL
15467 +       /*
15468 +        * In PREEMPT_RT there's an added race.
15469 +        * If the task that we are about to requeue times out,
15470 +        * it can set PI_WAKEUP_INPROGRESS. This tells the requeue
15471 +        * to skip this task. But right after the task sets
15472 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS, it can then
15473 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
15474 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
15475 +        * lock that it blocks on. We *must not* place this task
15476 +        * on this proxy lock in that case.
15477 +        *
15478 +        * To prevent this race, we first take the task's pi_lock
15479 +        * and check if it has updated its pi_blocked_on. If it has,
15480 +        * we assume that it woke up and we return -EAGAIN.
15481 +        * Otherwise, we set the task's pi_blocked_on to
15482 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
15483 +        * it will know that we are in the process of requeuing it.
15484 +        */
15485 +       raw_spin_lock(&task->pi_lock);
15486 +       if (task->pi_blocked_on) {
15487 +               raw_spin_unlock(&task->pi_lock);
15488 +               raw_spin_unlock_irq(&lock->wait_lock);
15489 +               return -EAGAIN;
15490 +       }
15491 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
15492 +       raw_spin_unlock(&task->pi_lock);
15493 +#endif
15494 +
15495 +       /* We enforce deadlock detection for futexes */
15496 +       ret = task_blocks_on_rt_mutex(lock, waiter, task,
15497 +                                     RT_MUTEX_FULL_CHAINWALK);
15498 +
15499 +       if (ret && !rt_mutex_owner(lock)) {
15500 +               /*
15501 +                * Reset the return value. We might have
15502 +                * returned with -EDEADLK and the owner
15503 +                * released the lock while we were walking the
15504 +                * pi chain.  Let the waiter sort it out.
15505 +                */
15506 +               ret = 0;
15507 +       }
15508 +
15509 +       if (ret && rt_mutex_has_waiters(lock))
15510 +               remove_waiter(lock, waiter);
15511 +
15512 +       debug_rt_mutex_print_deadlock(waiter);
15513 +
15514 +       return ret;
15515  }
15516  
15517  /**
15518 @@ -1670,33 +2349,9 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15519         int ret;
15520  
15521         raw_spin_lock_irq(&lock->wait_lock);
15522 -
15523 -       if (try_to_take_rt_mutex(lock, task, NULL)) {
15524 -               raw_spin_unlock_irq(&lock->wait_lock);
15525 -               return 1;
15526 -       }
15527 -
15528 -       /* We enforce deadlock detection for futexes */
15529 -       ret = task_blocks_on_rt_mutex(lock, waiter, task,
15530 -                                     RT_MUTEX_FULL_CHAINWALK);
15531 -
15532 -       if (ret && !rt_mutex_owner(lock)) {
15533 -               /*
15534 -                * Reset the return value. We might have
15535 -                * returned with -EDEADLK and the owner
15536 -                * released the lock while we were walking the
15537 -                * pi chain.  Let the waiter sort it out.
15538 -                */
15539 -               ret = 0;
15540 -       }
15541 -
15542 -       if (unlikely(ret))
15543 -               remove_waiter(lock, waiter);
15544 -
15545 +       ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
15546         raw_spin_unlock_irq(&lock->wait_lock);
15547  
15548 -       debug_rt_mutex_print_deadlock(waiter);
15549 -
15550         return ret;
15551  }
15552  
15553 @@ -1721,24 +2376,27 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
15554  }
15555  
15556  /**
15557 - * rt_mutex_finish_proxy_lock() - Complete lock acquisition
15558 + * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
15559   * @lock:              the rt_mutex we were woken on
15560   * @to:                        the timeout, null if none. hrtimer should already have
15561   *                     been started.
15562   * @waiter:            the pre-initialized rt_mutex_waiter
15563   *
15564 - * Complete the lock acquisition started our behalf by another thread.
15565 + * Wait for the lock acquisition started on our behalf by
15566 + * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
15567 + * rt_mutex_cleanup_proxy_lock().
15568   *
15569   * Returns:
15570   *  0 - success
15571   * <0 - error, one of -EINTR, -ETIMEDOUT
15572   *
15573 - * Special API call for PI-futex requeue support
15574 + * Special API call for PI-futex support
15575   */
15576 -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15577 +int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
15578                                struct hrtimer_sleeper *to,
15579                                struct rt_mutex_waiter *waiter)
15580  {
15581 +       struct task_struct *tsk = current;
15582         int ret;
15583  
15584         raw_spin_lock_irq(&lock->wait_lock);
15585 @@ -1746,10 +2404,65 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15586         set_current_state(TASK_INTERRUPTIBLE);
15587  
15588         /* sleep on the mutex */
15589 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
15590 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
15591  
15592 -       if (unlikely(ret))
15593 +       /*
15594 +        * RT has a problem here when the wait got interrupted by a timeout
15595 +        * or a signal. task->pi_blocked_on is still set. The task must
15596 +        * acquire the hash bucket lock when returning from this function.
15597 +        *
15598 +        * If the hash bucket lock is contended then the
15599 +        * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in
15600 +        * task_blocks_on_rt_mutex() will trigger. This can be avoided by
15601 +        * clearing task->pi_blocked_on which removes the task from the
15602 +        * boosting chain of the rtmutex. That's correct because the task
15603 +        * is no longer blocked on it.
15604 +        */
15605 +       if (ret) {
15606 +               raw_spin_lock(&tsk->pi_lock);
15607 +               tsk->pi_blocked_on = NULL;
15608 +               raw_spin_unlock(&tsk->pi_lock);
15609 +       }
15610 +
15611 +       raw_spin_unlock_irq(&lock->wait_lock);
15612 +
15613 +       return ret;
15614 +}
15615 +
15616 +/**
15617 + * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
15618 + * @lock:              the rt_mutex we were woken on
15619 + * @waiter:            the pre-initialized rt_mutex_waiter
15620 + *
15621 + * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
15622 + *
15623 + * Unless we acquired the lock, we're still enqueued on the wait-list and can
15624 + * in fact still be granted ownership until we're removed. Therefore we can
15625 + * find we are in fact the owner and must disregard the
15626 + * rt_mutex_wait_proxy_lock() failure.
15627 + *
15628 + * Returns:
15629 + *  true  - did the cleanup, we are done.
15630 + *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
15631 + *          caller should disregard its return value.
15632 + *
15633 + * Special API call for PI-futex support
15634 + */
15635 +bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
15636 +                                struct rt_mutex_waiter *waiter)
15637 +{
15638 +       bool cleanup = false;
15639 +
15640 +       raw_spin_lock_irq(&lock->wait_lock);
15641 +       /*
15642 +        * Unless we're the owner, we're still enqueued on the wait_list.
15643 +        * So check if we became owner, if not, take us off the wait_list.
15644 +        */
15645 +       if (rt_mutex_owner(lock) != current) {
15646                 remove_waiter(lock, waiter);
15647 +               fixup_rt_mutex_waiters(lock);
15648 +               cleanup = true;
15649 +       }
15650  
15651         /*
15652          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
15653 @@ -1759,5 +2472,91 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15654  
15655         raw_spin_unlock_irq(&lock->wait_lock);
15656  
15657 +       return cleanup;
15658 +}
15659 +
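The kernel-doc for rt_mutex_wait_proxy_lock() and rt_mutex_cleanup_proxy_lock() above prescribes a fixed calling sequence. A hypothetical caller sketch (the real user is the PI-futex requeue code; the helper name below is made up, and it relies on the declarations from rtmutex_common.h shown further below):

        /* Wait for a proxy-started rt_mutex and sort out the "raced to ownership" case. */
        static int demo_wait_for_proxy(struct rt_mutex *pi_mutex,
                                       struct hrtimer_sleeper *to,
                                       struct rt_mutex_waiter *waiter)
        {
                int ret;

                ret = rt_mutex_wait_proxy_lock(pi_mutex, to, waiter);
                if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, waiter)) {
                        /*
                         * We became the owner after rt_mutex_wait_proxy_lock()
                         * gave up, so the error must be dropped.
                         */
                        ret = 0;
                }
                return ret;
        }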
15660 +static inline int
15661 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
15662 +{
15663 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
15664 +       unsigned tmp;
15665 +
15666 +       if (ctx->deadlock_inject_countdown-- == 0) {
15667 +               tmp = ctx->deadlock_inject_interval;
15668 +               if (tmp > UINT_MAX/4)
15669 +                       tmp = UINT_MAX;
15670 +               else
15671 +                       tmp = tmp*2 + tmp + tmp/2;
15672 +
15673 +               ctx->deadlock_inject_interval = tmp;
15674 +               ctx->deadlock_inject_countdown = tmp;
15675 +               ctx->contending_lock = lock;
15676 +
15677 +               ww_mutex_unlock(lock);
15678 +
15679 +               return -EDEADLK;
15680 +       }
15681 +#endif
15682 +
15683 +       return 0;
15684 +}
15685 +
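The update tmp = tmp*2 + tmp + tmp/2 grows the injection interval by roughly a factor of 3.5 per forced back-off, so injected -EDEADLK results become exponentially rarer over the lifetime of an acquire context. Assuming the initial interval of 1 that ww_acquire_init() sets up under CONFIG_DEBUG_WW_MUTEX_SLOWPATH, the thresholds are:

        /* Interval growth in integer arithmetic (illustrative):
         *    1 ->  1*2 +  1 +  0 =   3
         *    3 ->  3*2 +  3 +  1 =  10
         *   10 -> 10*2 + 10 +  5 =  35
         *   35 -> 35*2 + 35 + 17 = 122
         */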
15686 +#ifdef CONFIG_PREEMPT_RT_FULL
15687 +int __sched
15688 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
15689 +{
15690 +       int ret;
15691 +
15692 +       might_sleep();
15693 +
15694 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
15695 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
15696 +       if (ret)
15697 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
15698 +       else if (!ret && ww_ctx->acquired > 1)
15699 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
15700 +
15701         return ret;
15702  }
15703 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
15704 +
15705 +int __sched
15706 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
15707 +{
15708 +       int ret;
15709 +
15710 +       might_sleep();
15711 +
15712 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
15713 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
15714 +       if (ret)
15715 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
15716 +       else if (!ret && ww_ctx->acquired > 1)
15717 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
15718 +
15719 +       return ret;
15720 +}
15721 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
15722 +
15723 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
15724 +{
15725 +       int nest = !!lock->ctx;
15726 +
15727 +       /*
15728 +        * The unlocking fastpath is the 0->1 transition from 'locked'
15729 +        * into 'unlocked' state:
15730 +        */
15731 +       if (nest) {
15732 +#ifdef CONFIG_DEBUG_MUTEXES
15733 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
15734 +#endif
15735 +               if (lock->ctx->acquired > 0)
15736 +                       lock->ctx->acquired--;
15737 +               lock->ctx = NULL;
15738 +       }
15739 +
15740 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
15741 +       rt_mutex_unlock(&lock->base.lock);
15742 +}
15743 +EXPORT_SYMBOL(ww_mutex_unlock);
15744 +#endif
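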
15745 diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
15746 index c4060584c407..6607802efa8b 100644
15747 --- a/kernel/locking/rtmutex.h
15748 +++ b/kernel/locking/rtmutex.h
15749 @@ -11,8 +11,6 @@
15750   */
15751  
15752  #define rt_mutex_deadlock_check(l)                     (0)
15753 -#define rt_mutex_deadlock_account_lock(m, t)           do { } while (0)
15754 -#define rt_mutex_deadlock_account_unlock(l)            do { } while (0)
15755  #define debug_rt_mutex_init_waiter(w)                  do { } while (0)
15756  #define debug_rt_mutex_free_waiter(w)                  do { } while (0)
15757  #define debug_rt_mutex_lock(l)                         do { } while (0)
15758 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
15759 index e317e1cbb3eb..64d89d780059 100644
15760 --- a/kernel/locking/rtmutex_common.h
15761 +++ b/kernel/locking/rtmutex_common.h
15762 @@ -27,12 +27,14 @@ struct rt_mutex_waiter {
15763         struct rb_node          pi_tree_entry;
15764         struct task_struct      *task;
15765         struct rt_mutex         *lock;
15766 +       bool                    savestate;
15767  #ifdef CONFIG_DEBUG_RT_MUTEXES
15768         unsigned long           ip;
15769         struct pid              *deadlock_task_pid;
15770         struct rt_mutex         *deadlock_lock;
15771  #endif
15772         int prio;
15773 +       u64 deadline;
15774  };
15775  
15776  /*
15777 @@ -98,21 +100,45 @@ enum rtmutex_chainwalk {
15778  /*
15779   * PI-futex support (proxy locking functions, etc.):
15780   */
15781 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
15782 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
15783 +
15784  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
15785  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
15786                                        struct task_struct *proxy_owner);
15787  extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
15788                                   struct task_struct *proxy_owner);
15789 +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
15790 +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15791 +                                    struct rt_mutex_waiter *waiter,
15792 +                                    struct task_struct *task);
15793  extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15794                                      struct rt_mutex_waiter *waiter,
15795                                      struct task_struct *task);
15796 -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15797 -                                     struct hrtimer_sleeper *to,
15798 -                                     struct rt_mutex_waiter *waiter);
15799 -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
15800 -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
15801 -                                 struct wake_q_head *wqh);
15802 -extern void rt_mutex_adjust_prio(struct task_struct *task);
15803 +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
15804 +                              struct hrtimer_sleeper *to,
15805 +                              struct rt_mutex_waiter *waiter);
15806 +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
15807 +                                struct rt_mutex_waiter *waiter);
15808 +
15809 +extern int rt_mutex_futex_trylock(struct rt_mutex *l);
15810 +
15811 +extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
15812 +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
15813 +                                struct wake_q_head *wqh,
15814 +                                struct wake_q_head *wq_sleeper);
15815 +
15816 +extern void rt_mutex_postunlock(struct wake_q_head *wake_q,
15817 +                               struct wake_q_head *wq_sleeper);
15818 +
15819 +/* RW semaphore special interface */
15820 +struct ww_acquire_ctx;
15821 +
15822 +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state,
15823 +                                    struct hrtimer_sleeper *timeout,
15824 +                                    enum rtmutex_chainwalk chwalk,
15825 +                                    struct ww_acquire_ctx *ww_ctx,
15826 +                                    struct rt_mutex_waiter *waiter);
15827  
15828  #ifdef CONFIG_DEBUG_RT_MUTEXES
15829  # include "rtmutex-debug.h"
15830 diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c
15831 new file mode 100644
15832 index 000000000000..4a708ffcded6
15833 --- /dev/null
15834 +++ b/kernel/locking/rwsem-rt.c
15835 @@ -0,0 +1,268 @@
15836 +/*
15837 + */
15838 +#include <linux/rwsem.h>
15839 +#include <linux/sched.h>
15840 +#include <linux/export.h>
15841 +
15842 +#include "rtmutex_common.h"
15843 +
15844 +/*
15845 + * RT-specific reader/writer semaphores
15846 + *
15847 + * down_write()
15848 + *  1) Lock sem->rtmutex
15849 + *  2) Remove the reader BIAS to force readers into the slow path
15850 + *  3) Wait until all readers have left the critical region
15851 + *  4) Mark it write locked
15852 + *
15853 + * up_write()
15854 + *  1) Remove the write locked marker
15855 + *  2) Set the reader BIAS so readers can use the fast path again
15856 + *  3) Unlock sem->rtmutex to release blocked readers
15857 + *
15858 + * down_read()
15859 + *  1) Try fast path acquisition (reader BIAS is set)
15860 + *  2) Take sem->rtmutex.wait_lock which protects the writelocked flag
15861 + *  3) If !writelocked, acquire it for read
15862 + *  4) If writelocked, block on sem->rtmutex
15863 + *  5) unlock sem->rtmutex, goto 1)
15864 + *
15865 + * up_read()
15866 + *  1) Try fast path release (reader count != 1)
15867 + *  2) Wake the writer waiting in down_write()#3
15868 + *
15869 + * down_read()#3 has the consequence that rw semaphores on RT are not writer
15870 + * fair, but writers, which should be avoided in RT tasks (think mmap_sem),
15871 + * are subject to the rtmutex priority/DL inheritance mechanism.
15872 + *
15873 + * It's possible to make the rw semaphores writer fair by keeping a list of
15874 + * active readers. A blocked writer would force all newly incoming readers to
15875 + * block on the rtmutex, but the rtmutex would have to be proxy locked for one
15876 + * reader after the other. We can't use multi-reader inheritance because there
15877 + * is no way to support that with SCHED_DEADLINE. Implementing the one by one
15878 + * reader boosting/handover mechanism would be major surgery for very dubious
15879 + * value.
15880 + *
15881 + * The risk of writer starvation is there, but the pathological use cases
15882 + * which trigger it are not necessarily the typical RT workloads.
15883 + */
15884 +
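None of this changes the rwsem API: an RT kernel runs the usual reader/writer calls on top of the implementation below. A minimal, hypothetical usage sketch:

        #include <linux/rwsem.h>

        static DECLARE_RWSEM(demo_sem);         /* hypothetical semaphore and data */
        static int demo_value;

        /* Readers normally take the fast path: an atomic increment of sem->readers. */
        static int demo_read(void)
        {
                int v;

                down_read(&demo_sem);
                v = demo_value;
                up_read(&demo_sem);
                return v;
        }

        /* The writer path takes sem->rtmutex, so a blocked writer is subject to
         * the priority/DL inheritance described above. */
        static void demo_write(int v)
        {
                down_write(&demo_sem);
                demo_value = v;
                up_write(&demo_sem);
        }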
15885 +void __rwsem_init(struct rw_semaphore *sem, const char *name,
15886 +                 struct lock_class_key *key)
15887 +{
15888 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
15889 +       /*
15890 +        * Make sure we are not reinitializing a held semaphore:
15891 +        */
15892 +       debug_check_no_locks_freed((void *)sem, sizeof(*sem));
15893 +       lockdep_init_map(&sem->dep_map, name, key, 0);
15894 +#endif
15895 +       atomic_set(&sem->readers, READER_BIAS);
15896 +}
15897 +EXPORT_SYMBOL(__rwsem_init);
15898 +
15899 +int __down_read_trylock(struct rw_semaphore *sem)
15900 +{
15901 +       int r, old;
15902 +
15903 +       /*
15904 +        * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is
15905 +        * set.
15906 +        */
15907 +       for (r = atomic_read(&sem->readers); r < 0;) {
15908 +               old = atomic_cmpxchg(&sem->readers, r, r + 1);
15909 +               if (likely(old == r))
15910 +                       return 1;
15911 +               r = old;
15912 +       }
15913 +       return 0;
15914 +}
15915 +
15916 +void __sched __down_read(struct rw_semaphore *sem)
15917 +{
15918 +       struct rt_mutex *m = &sem->rtmutex;
15919 +       struct rt_mutex_waiter waiter;
15920 +
15921 +       if (__down_read_trylock(sem))
15922 +               return;
15923 +
15924 +       might_sleep();
15925 +       raw_spin_lock_irq(&m->wait_lock);
15926 +       /*
15927 +        * Allow readers as long as the writer has not completely
15928 +        * acquired the semaphore for write.
15929 +        */
15930 +       if (atomic_read(&sem->readers) != WRITER_BIAS) {
15931 +               atomic_inc(&sem->readers);
15932 +               raw_spin_unlock_irq(&m->wait_lock);
15933 +               return;
15934 +       }
15935 +
15936 +       /*
15937 +        * Call into the slow lock path with the rtmutex->wait_lock
15938 +        * held, so this can't result in the following race:
15939 +        *
15940 +        * Reader1              Reader2         Writer
15941 +        *                      down_read()
15942 +        *                                      down_write()
15943 +        *                                      rtmutex_lock(m)
15944 +        *                                      swait()
15945 +        * down_read()
15946 +        * unlock(m->wait_lock)
15947 +        *                      up_read()
15948 +        *                      swake()
15949 +        *                                      lock(m->wait_lock)
15950 +        *                                      sem->writelocked=true
15951 +        *                                      unlock(m->wait_lock)
15952 +        *
15953 +        *                                      up_write()
15954 +        *                                      sem->writelocked=false
15955 +        *                                      rtmutex_unlock(m)
15956 +        *                      down_read()
15957 +        *                                      down_write()
15958 +        *                                      rtmutex_lock(m)
15959 +        *                                      swait()
15960 +        * rtmutex_lock(m)
15961 +        *
15962 +        * That would put Reader1 behind the writer waiting on
15963 +        * Reader2 to call up_read(), which might take an unbounded time.
15964 +        */
15965 +       rt_mutex_init_waiter(&waiter, false);
15966 +       rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL,
15967 +                                RT_MUTEX_MIN_CHAINWALK, NULL,
15968 +                                &waiter);
15969 +       /*
15970 +        * The slowlock() above is guaranteed to return with the rtmutex now
15971 +        * held, so there can't be a writer active. Increment the reader
15972 +        * count and immediately drop the rtmutex again.
15973 +        */
15974 +       atomic_inc(&sem->readers);
15975 +       raw_spin_unlock_irq(&m->wait_lock);
15976 +       rt_mutex_unlock(m);
15977 +
15978 +       debug_rt_mutex_free_waiter(&waiter);
15979 +}
15980 +
15981 +void __up_read(struct rw_semaphore *sem)
15982 +{
15983 +       struct rt_mutex *m = &sem->rtmutex;
15984 +       struct task_struct *tsk;
15985 +
15986 +       /*
15987 +        * sem->readers can only hit 0 when a writer is waiting for the
15988 +        * active readers to leave the critical region.
15989 +        */
15990 +       if (!atomic_dec_and_test(&sem->readers))
15991 +               return;
15992 +
15993 +       might_sleep();
15994 +       raw_spin_lock_irq(&m->wait_lock);
15995 +       /*
15996 +        * Wake the writer, i.e. the rtmutex owner. It might release the
15997 +        * rtmutex concurrently in the fast path (due to a signal), but to
15998 +        * clean up the rwsem it needs to acquire m->wait_lock. The worst
15999 +        * case which can happen is a spurious wakeup.
16000 +        */
16001 +       tsk = rt_mutex_owner(m);
16002 +       if (tsk)
16003 +               wake_up_process(tsk);
16004 +
16005 +       raw_spin_unlock_irq(&m->wait_lock);
16006 +}
16007 +
16008 +static void __up_write_unlock(struct rw_semaphore *sem, int bias,
16009 +                             unsigned long flags)
16010 +{
16011 +       struct rt_mutex *m = &sem->rtmutex;
16012 +
16013 +       atomic_add(READER_BIAS - bias, &sem->readers);
16014 +       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16015 +       rt_mutex_unlock(m);
16016 +}
16017 +
16018 +static int __sched __down_write_common(struct rw_semaphore *sem, int state)
16019 +{
16020 +       struct rt_mutex *m = &sem->rtmutex;
16021 +       unsigned long flags;
16022 +
16023 +       /* Take the rtmutex as a first step */
16024 +       if (rt_mutex_lock_state(m, state))
16025 +               return -EINTR;
16026 +
16027 +       /* Force readers into slow path */
16028 +       atomic_sub(READER_BIAS, &sem->readers);
16029 +       might_sleep();
16030 +
16031 +       set_current_state(state);
16032 +       for (;;) {
16033 +               raw_spin_lock_irqsave(&m->wait_lock, flags);
16034 +               /* Have all readers left the critical region? */
16035 +               if (!atomic_read(&sem->readers)) {
16036 +                       atomic_set(&sem->readers, WRITER_BIAS);
16037 +                       __set_current_state(TASK_RUNNING);
16038 +                       raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16039 +                       return 0;
16040 +               }
16041 +
16042 +               if (signal_pending_state(state, current)) {
16043 +                       __set_current_state(TASK_RUNNING);
16044 +                       __up_write_unlock(sem, 0, flags);
16045 +                       return -EINTR;
16046 +               }
16047 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16048 +
16049 +               if (atomic_read(&sem->readers) != 0) {
16050 +                       schedule();
16051 +                       set_current_state(state);
16052 +               }
16053 +       }
16054 +}
16055 +
16056 +void __sched __down_write(struct rw_semaphore *sem)
16057 +{
16058 +       __down_write_common(sem, TASK_UNINTERRUPTIBLE);
16059 +}
16060 +
16061 +int __sched __down_write_killable(struct rw_semaphore *sem)
16062 +{
16063 +       return __down_write_common(sem, TASK_KILLABLE);
16064 +}
16065 +
16066 +int __down_write_trylock(struct rw_semaphore *sem)
16067 +{
16068 +       struct rt_mutex *m = &sem->rtmutex;
16069 +       unsigned long flags;
16070 +
16071 +       if (!rt_mutex_trylock(m))
16072 +               return 0;
16073 +
16074 +       atomic_sub(READER_BIAS, &sem->readers);
16075 +
16076 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16077 +       if (!atomic_read(&sem->readers)) {
16078 +               atomic_set(&sem->readers, WRITER_BIAS);
16079 +               raw_spin_unlock_irqrestore(&m->wait_lock, flags);
16080 +               return 1;
16081 +       }
16082 +       __up_write_unlock(sem, 0, flags);
16083 +       return 0;
16084 +}
16085 +
16086 +void __up_write(struct rw_semaphore *sem)
16087 +{
16088 +       struct rt_mutex *m = &sem->rtmutex;
16089 +       unsigned long flags;
16090 +
16091 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16092 +       __up_write_unlock(sem, WRITER_BIAS, flags);
16093 +}
16094 +
16095 +void __downgrade_write(struct rw_semaphore *sem)
16096 +{
16097 +       struct rt_mutex *m = &sem->rtmutex;
16098 +       unsigned long flags;
16099 +
16100 +       raw_spin_lock_irqsave(&m->wait_lock, flags);
16101 +       /* Release it and account current as reader */
16102 +       __up_write_unlock(sem, WRITER_BIAS - 1, flags);
16103 +}
16104 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
16105 index db3ccb1dd614..909779647bd1 100644
16106 --- a/kernel/locking/spinlock.c
16107 +++ b/kernel/locking/spinlock.c
16108 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
16109   *         __[spin|read|write]_lock_bh()
16110   */
16111  BUILD_LOCK_OPS(spin, raw_spinlock);
16112 +
16113 +#ifndef CONFIG_PREEMPT_RT_FULL
16114  BUILD_LOCK_OPS(read, rwlock);
16115  BUILD_LOCK_OPS(write, rwlock);
16116 +#endif
16117  
16118  #endif
16119  
16120 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
16121  EXPORT_SYMBOL(_raw_spin_unlock_bh);
16122  #endif
16123  
16124 +#ifndef CONFIG_PREEMPT_RT_FULL
16125 +
16126  #ifndef CONFIG_INLINE_READ_TRYLOCK
16127  int __lockfunc _raw_read_trylock(rwlock_t *lock)
16128  {
16129 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
16130  EXPORT_SYMBOL(_raw_write_unlock_bh);
16131  #endif
16132  
16133 +#endif /* !PREEMPT_RT_FULL */
16134 +
16135  #ifdef CONFIG_DEBUG_LOCK_ALLOC
16136  
16137  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
16138 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
16139 index 0374a596cffa..94970338d518 100644
16140 --- a/kernel/locking/spinlock_debug.c
16141 +++ b/kernel/locking/spinlock_debug.c
16142 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
16143  
16144  EXPORT_SYMBOL(__raw_spin_lock_init);
16145  
16146 +#ifndef CONFIG_PREEMPT_RT_FULL
16147  void __rwlock_init(rwlock_t *lock, const char *name,
16148                    struct lock_class_key *key)
16149  {
16150 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
16151  }
16152  
16153  EXPORT_SYMBOL(__rwlock_init);
16154 +#endif
16155  
16156  static void spin_dump(raw_spinlock_t *lock, const char *msg)
16157  {
16158 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
16159         arch_spin_unlock(&lock->raw_lock);
16160  }
16161  
16162 +#ifndef CONFIG_PREEMPT_RT_FULL
16163  static void rwlock_bug(rwlock_t *lock, const char *msg)
16164  {
16165         if (!debug_locks_off())
16166 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
16167         debug_write_unlock(lock);
16168         arch_write_unlock(&lock->raw_lock);
16169  }
16170 +
16171 +#endif
16172 diff --git a/kernel/module.c b/kernel/module.c
16173 index 0e54d5bf0097..f27764fbfa24 100644
16174 --- a/kernel/module.c
16175 +++ b/kernel/module.c
16176 @@ -660,16 +660,7 @@ static void percpu_modcopy(struct module *mod,
16177                 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
16178  }
16179  
16180 -/**
16181 - * is_module_percpu_address - test whether address is from module static percpu
16182 - * @addr: address to test
16183 - *
16184 - * Test whether @addr belongs to module static percpu area.
16185 - *
16186 - * RETURNS:
16187 - * %true if @addr is from module static percpu area
16188 - */
16189 -bool is_module_percpu_address(unsigned long addr)
16190 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
16191  {
16192         struct module *mod;
16193         unsigned int cpu;
16194 @@ -683,9 +674,15 @@ bool is_module_percpu_address(unsigned long addr)
16195                         continue;
16196                 for_each_possible_cpu(cpu) {
16197                         void *start = per_cpu_ptr(mod->percpu, cpu);
16198 +                       void *va = (void *)addr;
16199  
16200 -                       if ((void *)addr >= start &&
16201 -                           (void *)addr < start + mod->percpu_size) {
16202 +                       if (va >= start && va < start + mod->percpu_size) {
16203 +                               if (can_addr) {
16204 +                                       *can_addr = (unsigned long) (va - start);
16205 +                                       *can_addr += (unsigned long)
16206 +                                               per_cpu_ptr(mod->percpu,
16207 +                                                           get_boot_cpu_id());
16208 +                               }
16209                                 preempt_enable();
16210                                 return true;
16211                         }
16212 @@ -696,6 +693,20 @@ bool is_module_percpu_address(unsigned long addr)
16213         return false;
16214  }
16215  
16216 +/**
16217 + * is_module_percpu_address - test whether address is from module static percpu
16218 + * @addr: address to test
16219 + *
16220 + * Test whether @addr belongs to module static percpu area.
16221 + *
16222 + * RETURNS:
16223 + * %true if @addr is from module static percpu area
16224 + */
16225 +bool is_module_percpu_address(unsigned long addr)
16226 +{
16227 +       return __is_module_percpu_address(addr, NULL);
16228 +}
16229 +
16230  #else /* ... !CONFIG_SMP */
16231  
16232  static inline void __percpu *mod_percpu(struct module *mod)
16233 @@ -727,6 +738,11 @@ bool is_module_percpu_address(unsigned long addr)
16234         return false;
16235  }
16236  
16237 +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
16238 +{
16239 +       return false;
16240 +}
16241 +
16242  #endif /* CONFIG_SMP */
16243  
16244  #define MODINFO_ATTR(field)    \
16245 diff --git a/kernel/panic.c b/kernel/panic.c
16246 index e6480e20379e..7e9c1918a94e 100644
16247 --- a/kernel/panic.c
16248 +++ b/kernel/panic.c
16249 @@ -482,9 +482,11 @@ static u64 oops_id;
16250  
16251  static int init_oops_id(void)
16252  {
16253 +#ifndef CONFIG_PREEMPT_RT_FULL
16254         if (!oops_id)
16255                 get_random_bytes(&oops_id, sizeof(oops_id));
16256         else
16257 +#endif
16258                 oops_id++;
16259  
16260         return 0;
16261 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
16262 index b26dbc48c75b..968255f27a33 100644
16263 --- a/kernel/power/hibernate.c
16264 +++ b/kernel/power/hibernate.c
16265 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
16266  
16267         local_irq_disable();
16268  
16269 +       system_state = SYSTEM_SUSPEND;
16270 +
16271         error = syscore_suspend();
16272         if (error) {
16273                 printk(KERN_ERR "PM: Some system devices failed to power down, "
16274 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
16275         syscore_resume();
16276  
16277   Enable_irqs:
16278 +       system_state = SYSTEM_RUNNING;
16279         local_irq_enable();
16280  
16281   Enable_cpus:
16282 @@ -446,6 +449,7 @@ static int resume_target_kernel(bool platform_mode)
16283                 goto Enable_cpus;
16284  
16285         local_irq_disable();
16286 +       system_state = SYSTEM_SUSPEND;
16287  
16288         error = syscore_suspend();
16289         if (error)
16290 @@ -479,6 +483,7 @@ static int resume_target_kernel(bool platform_mode)
16291         syscore_resume();
16292  
16293   Enable_irqs:
16294 +       system_state = SYSTEM_RUNNING;
16295         local_irq_enable();
16296  
16297   Enable_cpus:
16298 @@ -564,6 +569,7 @@ int hibernation_platform_enter(void)
16299                 goto Enable_cpus;
16300  
16301         local_irq_disable();
16302 +       system_state = SYSTEM_SUSPEND;
16303         syscore_suspend();
16304         if (pm_wakeup_pending()) {
16305                 error = -EAGAIN;
16306 @@ -576,6 +582,7 @@ int hibernation_platform_enter(void)
16307  
16308   Power_up:
16309         syscore_resume();
16310 +       system_state = SYSTEM_RUNNING;
16311         local_irq_enable();
16312  
16313   Enable_cpus:
16314 @@ -676,6 +683,10 @@ static int load_image_and_restore(void)
16315         return error;
16316  }
16317  
16318 +#ifndef CONFIG_SUSPEND
16319 +bool pm_in_action;
16320 +#endif
16321 +
16322  /**
16323   * hibernate - Carry out system hibernation, including saving the image.
16324   */
16325 @@ -689,6 +700,8 @@ int hibernate(void)
16326                 return -EPERM;
16327         }
16328  
16329 +       pm_in_action = true;
16330 +
16331         lock_system_sleep();
16332         /* The snapshot device should not be opened while we're running */
16333         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
16334 @@ -766,6 +779,7 @@ int hibernate(void)
16335         atomic_inc(&snapshot_device_available);
16336   Unlock:
16337         unlock_system_sleep();
16338 +       pm_in_action = false;
16339         return error;
16340  }
16341  
16342 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
16343 index 6ccb08f57fcb..c8cbb5ed2fe3 100644
16344 --- a/kernel/power/suspend.c
16345 +++ b/kernel/power/suspend.c
16346 @@ -369,6 +369,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
16347         arch_suspend_disable_irqs();
16348         BUG_ON(!irqs_disabled());
16349  
16350 +       system_state = SYSTEM_SUSPEND;
16351 +
16352         error = syscore_suspend();
16353         if (!error) {
16354                 *wakeup = pm_wakeup_pending();
16355 @@ -385,6 +387,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
16356                 syscore_resume();
16357         }
16358  
16359 +       system_state = SYSTEM_RUNNING;
16360 +
16361         arch_suspend_enable_irqs();
16362         BUG_ON(irqs_disabled());
16363  
16364 @@ -527,6 +531,8 @@ static int enter_state(suspend_state_t state)
16365         return error;
16366  }
16367  
16368 +bool pm_in_action;
16369 +
16370  /**
16371   * pm_suspend - Externally visible function for suspending the system.
16372   * @state: System sleep state to enter.
16373 @@ -541,6 +547,8 @@ int pm_suspend(suspend_state_t state)
16374         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
16375                 return -EINVAL;
16376  
16377 +       pm_in_action = true;
16378 +
16379         error = enter_state(state);
16380         if (error) {
16381                 suspend_stats.fail++;
16382 @@ -548,6 +556,7 @@ int pm_suspend(suspend_state_t state)
16383         } else {
16384                 suspend_stats.success++;
16385         }
16386 +       pm_in_action = false;
16387         return error;
16388  }
16389  EXPORT_SYMBOL(pm_suspend);
16390 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
16391 index 9c5b231684d0..cf15bdb6855b 100644
16392 --- a/kernel/printk/printk.c
16393 +++ b/kernel/printk/printk.c
16394 @@ -351,6 +351,65 @@ __packed __aligned(4)
16395   */
16396  DEFINE_RAW_SPINLOCK(logbuf_lock);
16397  
16398 +#ifdef CONFIG_EARLY_PRINTK
16399 +struct console *early_console;
16400 +
16401 +static void early_vprintk(const char *fmt, va_list ap)
16402 +{
16403 +       if (early_console) {
16404 +               char buf[512];
16405 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
16406 +
16407 +               early_console->write(early_console, buf, n);
16408 +       }
16409 +}
16410 +
16411 +asmlinkage void early_printk(const char *fmt, ...)
16412 +{
16413 +       va_list ap;
16414 +
16415 +       va_start(ap, fmt);
16416 +       early_vprintk(fmt, ap);
16417 +       va_end(ap);
16418 +}
16419 +
16420 +/*
16421 + * This is independent of any log levels - a global
16422 + * kill switch that turns off all of printk.
16423 + *
16424 + * Used by the NMI watchdog if early-printk is enabled.
16425 + */
16426 +static bool __read_mostly printk_killswitch;
16427 +
16428 +static int __init force_early_printk_setup(char *str)
16429 +{
16430 +       printk_killswitch = true;
16431 +       return 0;
16432 +}
16433 +early_param("force_early_printk", force_early_printk_setup);
16434 +
16435 +void printk_kill(void)
16436 +{
16437 +       printk_killswitch = true;
16438 +}
16439 +
16440 +#ifdef CONFIG_PRINTK
16441 +static int forced_early_printk(const char *fmt, va_list ap)
16442 +{
16443 +       if (!printk_killswitch)
16444 +               return 0;
16445 +       early_vprintk(fmt, ap);
16446 +       return 1;
16447 +}
16448 +#endif
16449 +
16450 +#else
16451 +static inline int forced_early_printk(const char *fmt, va_list ap)
16452 +{
16453 +       return 0;
16454 +}
16455 +#endif
16456 +
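For illustration, a debugging or watchdog path would use the kill switch like this (the function below is hypothetical; it assumes an early console was registered at boot, e.g. via earlyprintk=, and that printk_kill() is declared where the caller can see it):

        /* Emergency reporting once the normal printk path can no longer be trusted. */
        static void demo_report_hard_lockup(int cpu)
        {
                printk_kill();  /* further printk() output is diverted to the early console */
                early_printk("hard lockup detected on CPU%d\n", cpu);
        }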
16457  #ifdef CONFIG_PRINTK
16458  DECLARE_WAIT_QUEUE_HEAD(log_wait);
16459  /* the next printk record to read by syslog(READ) or /proc/kmsg */
16460 @@ -1337,6 +1396,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
16461  {
16462         char *text;
16463         int len = 0;
16464 +       int attempts = 0;
16465  
16466         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
16467         if (!text)
16468 @@ -1348,6 +1408,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
16469                 u64 seq;
16470                 u32 idx;
16471                 enum log_flags prev;
16472 +               int num_msg;
16473 +try_again:
16474 +               attempts++;
16475 +               if (attempts > 10) {
16476 +                       len = -EBUSY;
16477 +                       goto out;
16478 +               }
16479 +               num_msg = 0;
16480  
16481                 /*
16482                  * Find first record that fits, including all following records,
16483 @@ -1363,6 +1431,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
16484                         prev = msg->flags;
16485                         idx = log_next(idx);
16486                         seq++;
16487 +                       num_msg++;
16488 +                       if (num_msg > 5) {
16489 +                               num_msg = 0;
16490 +                               raw_spin_unlock_irq(&logbuf_lock);
16491 +                               raw_spin_lock_irq(&logbuf_lock);
16492 +                               if (clear_seq < log_first_seq)
16493 +                                       goto try_again;
16494 +                       }
16495                 }
16496  
16497                 /* move first record forward until length fits into the buffer */
16498 @@ -1376,6 +1452,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
16499                         prev = msg->flags;
16500                         idx = log_next(idx);
16501                         seq++;
16502 +                       num_msg++;
16503 +                       if (num_msg > 5) {
16504 +                               num_msg = 0;
16505 +                               raw_spin_unlock_irq(&logbuf_lock);
16506 +                               raw_spin_lock_irq(&logbuf_lock);
16507 +                               if (clear_seq < log_first_seq)
16508 +                                       goto try_again;
16509 +                       }
16510                 }
16511  
16512                 /* last message fitting into this dump */
16513 @@ -1416,6 +1500,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
16514                 clear_seq = log_next_seq;
16515                 clear_idx = log_next_idx;
16516         }
16517 +out:
16518         raw_spin_unlock_irq(&logbuf_lock);
16519  
16520         kfree(text);
16521 @@ -1569,6 +1654,12 @@ static void call_console_drivers(int level,
16522         if (!console_drivers)
16523                 return;
16524  
16525 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
16526 +               if (in_irq() || in_nmi())
16527 +                       return;
16528 +       }
16529 +
16530 +       migrate_disable();
16531         for_each_console(con) {
16532                 if (exclusive_console && con != exclusive_console)
16533                         continue;
16534 @@ -1584,6 +1675,7 @@ static void call_console_drivers(int level,
16535                 else
16536                         con->write(con, text, len);
16537         }
16538 +       migrate_enable();
16539  }
16540  
16541  /*
16542 @@ -1781,6 +1873,13 @@ asmlinkage int vprintk_emit(int facility, int level,
16543         /* cpu currently holding logbuf_lock in this function */
16544         static unsigned int logbuf_cpu = UINT_MAX;
16545  
16546 +       /*
16547 +        * Fall back to early_printk if a debugging subsystem has
16548 +        * killed printk output
16549 +        */
16550 +       if (unlikely(forced_early_printk(fmt, args)))
16551 +               return 1;
16552 +
16553         if (level == LOGLEVEL_SCHED) {
16554                 level = LOGLEVEL_DEFAULT;
16555                 in_sched = true;
16556 @@ -1885,13 +1984,23 @@ asmlinkage int vprintk_emit(int facility, int level,
16557  
16558         /* If called from the scheduler, we can not call up(). */
16559         if (!in_sched) {
16560 +               int may_trylock = 1;
16561 +
16562                 lockdep_off();
16563 +#ifdef CONFIG_PREEMPT_RT_FULL
16564 +               /*
16565 +                * we can't take a sleeping lock with IRQs or preemption disabled,
16566 +                * so we can't print in these contexts
16567 +                */
16568 +               if (!(preempt_count() == 0 && !irqs_disabled()))
16569 +                       may_trylock = 0;
16570 +#endif
16571                 /*
16572                  * Try to acquire and then immediately release the console
16573                  * semaphore.  The release will print out buffers and wake up
16574                  * /dev/kmsg and syslog() users.
16575                  */
16576 -               if (console_trylock())
16577 +               if (may_trylock && console_trylock())
16578                         console_unlock();
16579                 lockdep_on();
16580         }
16581 @@ -2014,26 +2123,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
16582  
16583  #endif /* CONFIG_PRINTK */
16584  
16585 -#ifdef CONFIG_EARLY_PRINTK
16586 -struct console *early_console;
16587 -
16588 -asmlinkage __visible void early_printk(const char *fmt, ...)
16589 -{
16590 -       va_list ap;
16591 -       char buf[512];
16592 -       int n;
16593 -
16594 -       if (!early_console)
16595 -               return;
16596 -
16597 -       va_start(ap, fmt);
16598 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
16599 -       va_end(ap);
16600 -
16601 -       early_console->write(early_console, buf, n);
16602 -}
16603 -#endif
16604 -
16605  static int __add_preferred_console(char *name, int idx, char *options,
16606                                    char *brl_options)
16607  {
16608 @@ -2303,11 +2392,16 @@ static void console_cont_flush(char *text, size_t size)
16609                 goto out;
16610  
16611         len = cont_print_text(text, size);
16612 +#ifdef CONFIG_PREEMPT_RT_FULL
16613 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
16614 +       call_console_drivers(cont.level, NULL, 0, text, len);
16615 +#else
16616         raw_spin_unlock(&logbuf_lock);
16617         stop_critical_timings();
16618         call_console_drivers(cont.level, NULL, 0, text, len);
16619         start_critical_timings();
16620         local_irq_restore(flags);
16621 +#endif
16622         return;
16623  out:
16624         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
16625 @@ -2431,13 +2525,17 @@ void console_unlock(void)
16626                 console_idx = log_next(console_idx);
16627                 console_seq++;
16628                 console_prev = msg->flags;
16629 +#ifdef CONFIG_PREEMPT_RT_FULL
16630 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
16631 +               call_console_drivers(level, ext_text, ext_len, text, len);
16632 +#else
16633                 raw_spin_unlock(&logbuf_lock);
16634  
16635                 stop_critical_timings();        /* don't trace print latency */
16636                 call_console_drivers(level, ext_text, ext_len, text, len);
16637                 start_critical_timings();
16638                 local_irq_restore(flags);
16639 -
16640 +#endif
16641                 if (do_cond_resched)
16642                         cond_resched();
16643         }
16644 @@ -2489,6 +2587,11 @@ void console_unblank(void)
16645  {
16646         struct console *c;
16647  
16648 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
16649 +               if (in_irq() || in_nmi())
16650 +                       return;
16651 +       }
16652 +
16653         /*
16654          * console_unblank can no longer be called in interrupt context unless
16655          * oops_in_progress is set to 1..
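The printk hunks above reduce to one rule: with PREEMPT_RT_FULL the console semaphore is a sleeping lock, so vprintk_emit() only attempts console_trylock() when preemption and interrupts are both enabled, and the console drivers are called after the logbuf lock has been dropped. A rough userspace sketch of that may_trylock gate follows; the pthread mutex and the in_atomic_ctx flag are illustrative stand-ins, not kernel interfaces.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t console_sem = PTHREAD_MUTEX_INITIALIZER;
static _Thread_local bool in_atomic_ctx; /* preempt_count()/irqs_disabled() stand-in */

/* Flush only when the current context is allowed to take a sleeping lock. */
static void try_flush_console(const char *msg)
{
        bool may_trylock = !in_atomic_ctx;      /* the may_trylock gate */

        if (may_trylock && pthread_mutex_trylock(&console_sem) == 0) {
                fputs(msg, stdout);             /* call_console_drivers() stand-in */
                pthread_mutex_unlock(&console_sem);
        }
        /* otherwise the message stays in the log buffer for a later flush */
}

int main(void)
{
        try_flush_console("printed from a preemptible context\n");
        in_atomic_ctx = true;
        try_flush_console("deferred: would need a sleeping lock in atomic context\n");
        return 0;
}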
16656 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
16657 index a5caecef88be..61e7c5e2183c 100644
16658 --- a/kernel/ptrace.c
16659 +++ b/kernel/ptrace.c
16660 @@ -166,7 +166,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
16661  
16662         spin_lock_irq(&task->sighand->siglock);
16663         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
16664 -               task->state = __TASK_TRACED;
16665 +               unsigned long flags;
16666 +
16667 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
16668 +               if (task->state & __TASK_TRACED)
16669 +                       task->state = __TASK_TRACED;
16670 +               else
16671 +                       task->saved_state = __TASK_TRACED;
16672 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
16673                 ret = true;
16674         }
16675         spin_unlock_irq(&task->sighand->siglock);
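The ptrace_freeze_traced() hunk is about the RT-only task->saved_state field: while a task sleeps on a converted spinlock its real state is parked there, so the freeze has to update whichever of the two fields currently carries __TASK_TRACED. A toy, single-threaded model of that decision; struct toy_task and the state values are invented for illustration.

#include <stdio.h>

#define __TASK_TRACED           0x0008
#define TASK_UNINTERRUPTIBLE    0x0002  /* stand-in for "sleeping on an rtmutex" */

struct toy_task {
        unsigned int state;        /* what the scheduler currently sees */
        unsigned int saved_state;  /* real state while blocked on a sleeping lock */
};

static void freeze_traced(struct toy_task *t)
{
        if (t->state & __TASK_TRACED)
                t->state = __TASK_TRACED;       /* common case: freeze in place */
        else
                t->saved_state = __TASK_TRACED; /* task is parked on a lock right now */
}

int main(void)
{
        struct toy_task normal  = { .state = __TASK_TRACED };
        struct toy_task on_lock = { .state = TASK_UNINTERRUPTIBLE,
                                    .saved_state = __TASK_TRACED };

        freeze_traced(&normal);
        freeze_traced(&on_lock);
        printf("normal:  state=%#x saved=%#x\n", normal.state, normal.saved_state);
        printf("on_lock: state=%#x saved=%#x\n", on_lock.state, on_lock.saved_state);
        return 0;
}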
16676 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
16677 index bf08fee53dc7..eeb8ce4ad7b6 100644
16678 --- a/kernel/rcu/rcutorture.c
16679 +++ b/kernel/rcu/rcutorture.c
16680 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
16681         .name           = "rcu"
16682  };
16683  
16684 +#ifndef CONFIG_PREEMPT_RT_FULL
16685  /*
16686   * Definitions for rcu_bh torture testing.
16687   */
16688 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
16689         .name           = "rcu_bh"
16690  };
16691  
16692 +#else
16693 +static struct rcu_torture_ops rcu_bh_ops = {
16694 +       .ttype          = INVALID_RCU_FLAVOR,
16695 +};
16696 +#endif
16697 +
16698  /*
16699   * Don't even think about trying any of these in real life!!!
16700   * The names includes "busted", and they really means it!
16701 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
16702 index 10f62c6f48e7..dbee19478f09 100644
16703 --- a/kernel/rcu/tree.c
16704 +++ b/kernel/rcu/tree.c
16705 @@ -55,6 +55,11 @@
16706  #include <linux/random.h>
16707  #include <linux/trace_events.h>
16708  #include <linux/suspend.h>
16709 +#include <linux/delay.h>
16710 +#include <linux/gfp.h>
16711 +#include <linux/oom.h>
16712 +#include <linux/smpboot.h>
16713 +#include "../time/tick-internal.h"
16714  
16715  #include "tree.h"
16716  #include "rcu.h"
16717 @@ -260,6 +265,19 @@ void rcu_sched_qs(void)
16718                            this_cpu_ptr(&rcu_sched_data), true);
16719  }
16720  
16721 +#ifdef CONFIG_PREEMPT_RT_FULL
16722 +static void rcu_preempt_qs(void);
16723 +
16724 +void rcu_bh_qs(void)
16725 +{
16726 +       unsigned long flags;
16727 +
16728 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
16729 +       local_irq_save(flags);
16730 +       rcu_preempt_qs();
16731 +       local_irq_restore(flags);
16732 +}
16733 +#else
16734  void rcu_bh_qs(void)
16735  {
16736         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
16737 @@ -269,6 +287,7 @@ void rcu_bh_qs(void)
16738                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
16739         }
16740  }
16741 +#endif
16742  
16743  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
16744  
16745 @@ -449,11 +468,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
16746  /*
16747   * Return the number of RCU BH batches started thus far for debug & stats.
16748   */
16749 +#ifndef CONFIG_PREEMPT_RT_FULL
16750  unsigned long rcu_batches_started_bh(void)
16751  {
16752         return rcu_bh_state.gpnum;
16753  }
16754  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
16755 +#endif
16756  
16757  /*
16758   * Return the number of RCU batches completed thus far for debug & stats.
16759 @@ -473,6 +494,7 @@ unsigned long rcu_batches_completed_sched(void)
16760  }
16761  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
16762  
16763 +#ifndef CONFIG_PREEMPT_RT_FULL
16764  /*
16765   * Return the number of RCU BH batches completed thus far for debug & stats.
16766   */
16767 @@ -481,6 +503,7 @@ unsigned long rcu_batches_completed_bh(void)
16768         return rcu_bh_state.completed;
16769  }
16770  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
16771 +#endif
16772  
16773  /*
16774   * Return the number of RCU expedited batches completed thus far for
16775 @@ -504,6 +527,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
16776  }
16777  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
16778  
16779 +#ifndef CONFIG_PREEMPT_RT_FULL
16780  /*
16781   * Force a quiescent state.
16782   */
16783 @@ -522,6 +546,13 @@ void rcu_bh_force_quiescent_state(void)
16784  }
16785  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
16786  
16787 +#else
16788 +void rcu_force_quiescent_state(void)
16789 +{
16790 +}
16791 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
16792 +#endif
16793 +
16794  /*
16795   * Force a quiescent state for RCU-sched.
16796   */
16797 @@ -572,9 +603,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
16798         case RCU_FLAVOR:
16799                 rsp = rcu_state_p;
16800                 break;
16801 +#ifndef CONFIG_PREEMPT_RT_FULL
16802         case RCU_BH_FLAVOR:
16803                 rsp = &rcu_bh_state;
16804                 break;
16805 +#endif
16806         case RCU_SCHED_FLAVOR:
16807                 rsp = &rcu_sched_state;
16808                 break;
16809 @@ -3016,18 +3049,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
16810  /*
16811   * Do RCU core processing for the current CPU.
16812   */
16813 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
16814 +static __latent_entropy void rcu_process_callbacks(void)
16815  {
16816         struct rcu_state *rsp;
16817  
16818         if (cpu_is_offline(smp_processor_id()))
16819                 return;
16820 -       trace_rcu_utilization(TPS("Start RCU core"));
16821         for_each_rcu_flavor(rsp)
16822                 __rcu_process_callbacks(rsp);
16823 -       trace_rcu_utilization(TPS("End RCU core"));
16824  }
16825  
16826 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
16827  /*
16828   * Schedule RCU callback invocation.  If the specified type of RCU
16829   * does not support RCU priority boosting, just do a direct call,
16830 @@ -3039,19 +3071,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
16831  {
16832         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
16833                 return;
16834 -       if (likely(!rsp->boost)) {
16835 -               rcu_do_batch(rsp, rdp);
16836 -               return;
16837 -       }
16838 -       invoke_rcu_callbacks_kthread();
16839 +       rcu_do_batch(rsp, rdp);
16840  }
16841  
16842 +static void rcu_wake_cond(struct task_struct *t, int status)
16843 +{
16844 +       /*
16845 +        * If the thread is yielding, only wake it when this
16846 +        * is invoked from idle
16847 +        */
16848 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
16849 +               wake_up_process(t);
16850 +}
16851 +
16852 +/*
16853 + * Wake up this CPU's rcuc kthread to do RCU core processing.
16854 + */
16855  static void invoke_rcu_core(void)
16856  {
16857 -       if (cpu_online(smp_processor_id()))
16858 -               raise_softirq(RCU_SOFTIRQ);
16859 +       unsigned long flags;
16860 +       struct task_struct *t;
16861 +
16862 +       if (!cpu_online(smp_processor_id()))
16863 +               return;
16864 +       local_irq_save(flags);
16865 +       __this_cpu_write(rcu_cpu_has_work, 1);
16866 +       t = __this_cpu_read(rcu_cpu_kthread_task);
16867 +       if (t != NULL && current != t)
16868 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
16869 +       local_irq_restore(flags);
16870  }
16871  
16872 +static void rcu_cpu_kthread_park(unsigned int cpu)
16873 +{
16874 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16875 +}
16876 +
16877 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
16878 +{
16879 +       return __this_cpu_read(rcu_cpu_has_work);
16880 +}
16881 +
16882 +/*
16883 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16884 + * RCU softirq used in flavors and configurations of RCU that do not
16885 + * support RCU priority boosting.
16886 + */
16887 +static void rcu_cpu_kthread(unsigned int cpu)
16888 +{
16889 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16890 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16891 +       int spincnt;
16892 +
16893 +       for (spincnt = 0; spincnt < 10; spincnt++) {
16894 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16895 +               local_bh_disable();
16896 +               *statusp = RCU_KTHREAD_RUNNING;
16897 +               this_cpu_inc(rcu_cpu_kthread_loops);
16898 +               local_irq_disable();
16899 +               work = *workp;
16900 +               *workp = 0;
16901 +               local_irq_enable();
16902 +               if (work)
16903 +                       rcu_process_callbacks();
16904 +               local_bh_enable();
16905 +               if (*workp == 0) {
16906 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16907 +                       *statusp = RCU_KTHREAD_WAITING;
16908 +                       return;
16909 +               }
16910 +       }
16911 +       *statusp = RCU_KTHREAD_YIELDING;
16912 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16913 +       schedule_timeout_interruptible(2);
16914 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16915 +       *statusp = RCU_KTHREAD_WAITING;
16916 +}
16917 +
16918 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16919 +       .store                  = &rcu_cpu_kthread_task,
16920 +       .thread_should_run      = rcu_cpu_kthread_should_run,
16921 +       .thread_fn              = rcu_cpu_kthread,
16922 +       .thread_comm            = "rcuc/%u",
16923 +       .setup                  = rcu_cpu_kthread_setup,
16924 +       .park                   = rcu_cpu_kthread_park,
16925 +};
16926 +
16927 +/*
16928 + * Spawn per-CPU RCU core processing kthreads.
16929 + */
16930 +static int __init rcu_spawn_core_kthreads(void)
16931 +{
16932 +       int cpu;
16933 +
16934 +       for_each_possible_cpu(cpu)
16935 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
16936 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16937 +       return 0;
16938 +}
16939 +early_initcall(rcu_spawn_core_kthreads);
16940 +
16941  /*
16942   * Handle any core-RCU processing required by a call_rcu() invocation.
16943   */
16944 @@ -3195,6 +3314,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
16945  }
16946  EXPORT_SYMBOL_GPL(call_rcu_sched);
16947  
16948 +#ifndef CONFIG_PREEMPT_RT_FULL
16949  /*
16950   * Queue an RCU callback for invocation after a quicker grace period.
16951   */
16952 @@ -3203,6 +3323,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
16953         __call_rcu(head, func, &rcu_bh_state, -1, 0);
16954  }
16955  EXPORT_SYMBOL_GPL(call_rcu_bh);
16956 +#endif
16957  
16958  /*
16959   * Queue an RCU callback for lazy invocation after a grace period.
16960 @@ -3294,6 +3415,7 @@ void synchronize_sched(void)
16961  }
16962  EXPORT_SYMBOL_GPL(synchronize_sched);
16963  
16964 +#ifndef CONFIG_PREEMPT_RT_FULL
16965  /**
16966   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
16967   *
16968 @@ -3320,6 +3442,7 @@ void synchronize_rcu_bh(void)
16969                 wait_rcu_gp(call_rcu_bh);
16970  }
16971  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
16972 +#endif
16973  
16974  /**
16975   * get_state_synchronize_rcu - Snapshot current RCU state
16976 @@ -3698,6 +3821,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
16977         mutex_unlock(&rsp->barrier_mutex);
16978  }
16979  
16980 +#ifndef CONFIG_PREEMPT_RT_FULL
16981  /**
16982   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
16983   */
16984 @@ -3706,6 +3830,7 @@ void rcu_barrier_bh(void)
16985         _rcu_barrier(&rcu_bh_state);
16986  }
16987  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
16988 +#endif
16989  
16990  /**
16991   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
16992 @@ -4227,12 +4352,13 @@ void __init rcu_init(void)
16993  
16994         rcu_bootup_announce();
16995         rcu_init_geometry();
16996 +#ifndef CONFIG_PREEMPT_RT_FULL
16997         rcu_init_one(&rcu_bh_state);
16998 +#endif
16999         rcu_init_one(&rcu_sched_state);
17000         if (dump_tree)
17001                 rcu_dump_rcu_node_tree(&rcu_sched_state);
17002         __rcu_init_preempt();
17003 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
17004  
17005         /*
17006          * We don't need protection against CPU-hotplug here because
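The tree.c changes retire RCU_SOFTIRQ and hand core processing to per-CPU rcuc kthreads that poll a per-CPU work flag: up to ten back-to-back passes while work keeps arriving, then a short yield. A loose userspace analogue of that loop is sketched below; the pthread, the atomics and the sleep interval stand in for the smpboot kthread machinery and are not part of the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int rcu_has_work;   /* one per CPU in the kernel; a single flag here */
static atomic_int stop_thread;

static void process_callbacks(void)        /* rcu_process_callbacks() stand-in */
{
        puts("processing RCU callbacks");
}

static void *rcuc_thread(void *arg)
{
        (void)arg;
        while (!atomic_load(&stop_thread)) {
                /* up to ten back-to-back passes while work keeps arriving */
                for (int spincnt = 0; spincnt < 10; spincnt++) {
                        if (atomic_exchange(&rcu_has_work, 0))
                                process_callbacks();
                        if (!atomic_load(&rcu_has_work))
                                break;
                }
                /* the kernel parks in the smpboot wait loop or yields for two
                 * jiffies; this model just sleeps briefly before polling again */
                usleep(2000);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, rcuc_thread, NULL);
        atomic_store(&rcu_has_work, 1);     /* invoke_rcu_core() analogue */
        usleep(10000);
        atomic_store(&stop_thread, 1);
        pthread_join(t, NULL);
        return 0;
}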
17007 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
17008 index e99a5234d9ed..958ac107062c 100644
17009 --- a/kernel/rcu/tree.h
17010 +++ b/kernel/rcu/tree.h
17011 @@ -588,18 +588,18 @@ extern struct list_head rcu_struct_flavors;
17012   */
17013  extern struct rcu_state rcu_sched_state;
17014  
17015 +#ifndef CONFIG_PREEMPT_RT_FULL
17016  extern struct rcu_state rcu_bh_state;
17017 +#endif
17018  
17019  #ifdef CONFIG_PREEMPT_RCU
17020  extern struct rcu_state rcu_preempt_state;
17021  #endif /* #ifdef CONFIG_PREEMPT_RCU */
17022  
17023 -#ifdef CONFIG_RCU_BOOST
17024  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17025  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
17026  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17027  DECLARE_PER_CPU(char, rcu_cpu_has_work);
17028 -#endif /* #ifdef CONFIG_RCU_BOOST */
17029  
17030  #ifndef RCU_TREE_NONCORE
17031  
17032 @@ -619,10 +619,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
17033  static void __init __rcu_init_preempt(void);
17034  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
17035  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
17036 -static void invoke_rcu_callbacks_kthread(void);
17037  static bool rcu_is_callbacks_kthread(void);
17038 +static void rcu_cpu_kthread_setup(unsigned int cpu);
17039  #ifdef CONFIG_RCU_BOOST
17040 -static void rcu_preempt_do_callbacks(void);
17041  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
17042                                                  struct rcu_node *rnp);
17043  #endif /* #ifdef CONFIG_RCU_BOOST */
17044 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
17045 index 56583e764ebf..7c656f8e192f 100644
17046 --- a/kernel/rcu/tree_plugin.h
17047 +++ b/kernel/rcu/tree_plugin.h
17048 @@ -24,25 +24,10 @@
17049   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
17050   */
17051  
17052 -#include <linux/delay.h>
17053 -#include <linux/gfp.h>
17054 -#include <linux/oom.h>
17055 -#include <linux/smpboot.h>
17056 -#include "../time/tick-internal.h"
17057 -
17058  #ifdef CONFIG_RCU_BOOST
17059  
17060  #include "../locking/rtmutex_common.h"
17061  
17062 -/*
17063 - * Control variables for per-CPU and per-rcu_node kthreads.  These
17064 - * handle all flavors of RCU.
17065 - */
17066 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
17067 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17068 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17069 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
17070 -
17071  #else /* #ifdef CONFIG_RCU_BOOST */
17072  
17073  /*
17074 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
17075  
17076  #endif /* #else #ifdef CONFIG_RCU_BOOST */
17077  
17078 +/*
17079 + * Control variables for per-CPU and per-rcu_node kthreads.  These
17080 + * handle all flavors of RCU.
17081 + */
17082 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
17083 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
17084 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
17085 +
17086  #ifdef CONFIG_RCU_NOCB_CPU
17087  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
17088  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
17089 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
17090         }
17091  
17092         /* Hardware IRQ handlers cannot block, complain if they get here. */
17093 -       if (in_irq() || in_serving_softirq()) {
17094 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
17095                 lockdep_rcu_suspicious(__FILE__, __LINE__,
17096                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
17097                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
17098 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
17099                 t->rcu_read_unlock_special.b.need_qs = true;
17100  }
17101  
17102 -#ifdef CONFIG_RCU_BOOST
17103 -
17104 -static void rcu_preempt_do_callbacks(void)
17105 -{
17106 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
17107 -}
17108 -
17109 -#endif /* #ifdef CONFIG_RCU_BOOST */
17110 -
17111  /*
17112   * Queue a preemptible-RCU callback for invocation after a grace period.
17113   */
17114 @@ -829,6 +813,19 @@ void exit_rcu(void)
17115  
17116  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
17117  
17118 +/*
17119 + * If boosting, set rcuc kthreads to realtime priority.
17120 + */
17121 +static void rcu_cpu_kthread_setup(unsigned int cpu)
17122 +{
17123 +#ifdef CONFIG_RCU_BOOST
17124 +       struct sched_param sp;
17125 +
17126 +       sp.sched_priority = kthread_prio;
17127 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
17128 +#endif /* #ifdef CONFIG_RCU_BOOST */
17129 +}
17130 +
17131  #ifdef CONFIG_RCU_BOOST
17132  
17133  #include "../locking/rtmutex_common.h"
17134 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
17135  
17136  #endif /* #else #ifdef CONFIG_RCU_TRACE */
17137  
17138 -static void rcu_wake_cond(struct task_struct *t, int status)
17139 -{
17140 -       /*
17141 -        * If the thread is yielding, only wake it when this
17142 -        * is invoked from idle
17143 -        */
17144 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
17145 -               wake_up_process(t);
17146 -}
17147 -
17148  /*
17149   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
17150   * or ->boost_tasks, advancing the pointer to the next task in the
17151 @@ -1013,23 +1000,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
17152  }
17153  
17154  /*
17155 - * Wake up the per-CPU kthread to invoke RCU callbacks.
17156 - */
17157 -static void invoke_rcu_callbacks_kthread(void)
17158 -{
17159 -       unsigned long flags;
17160 -
17161 -       local_irq_save(flags);
17162 -       __this_cpu_write(rcu_cpu_has_work, 1);
17163 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
17164 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
17165 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
17166 -                             __this_cpu_read(rcu_cpu_kthread_status));
17167 -       }
17168 -       local_irq_restore(flags);
17169 -}
17170 -
17171 -/*
17172   * Is the current CPU running the RCU-callbacks kthread?
17173   * Caller must have preemption disabled.
17174   */
17175 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
17176         return 0;
17177  }
17178  
17179 -static void rcu_kthread_do_work(void)
17180 -{
17181 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
17182 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
17183 -       rcu_preempt_do_callbacks();
17184 -}
17185 -
17186 -static void rcu_cpu_kthread_setup(unsigned int cpu)
17187 -{
17188 -       struct sched_param sp;
17189 -
17190 -       sp.sched_priority = kthread_prio;
17191 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
17192 -}
17193 -
17194 -static void rcu_cpu_kthread_park(unsigned int cpu)
17195 -{
17196 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
17197 -}
17198 -
17199 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
17200 -{
17201 -       return __this_cpu_read(rcu_cpu_has_work);
17202 -}
17203 -
17204 -/*
17205 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
17206 - * RCU softirq used in flavors and configurations of RCU that do not
17207 - * support RCU priority boosting.
17208 - */
17209 -static void rcu_cpu_kthread(unsigned int cpu)
17210 -{
17211 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
17212 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
17213 -       int spincnt;
17214 -
17215 -       for (spincnt = 0; spincnt < 10; spincnt++) {
17216 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
17217 -               local_bh_disable();
17218 -               *statusp = RCU_KTHREAD_RUNNING;
17219 -               this_cpu_inc(rcu_cpu_kthread_loops);
17220 -               local_irq_disable();
17221 -               work = *workp;
17222 -               *workp = 0;
17223 -               local_irq_enable();
17224 -               if (work)
17225 -                       rcu_kthread_do_work();
17226 -               local_bh_enable();
17227 -               if (*workp == 0) {
17228 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
17229 -                       *statusp = RCU_KTHREAD_WAITING;
17230 -                       return;
17231 -               }
17232 -       }
17233 -       *statusp = RCU_KTHREAD_YIELDING;
17234 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
17235 -       schedule_timeout_interruptible(2);
17236 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
17237 -       *statusp = RCU_KTHREAD_WAITING;
17238 -}
17239 -
17240  /*
17241   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
17242   * served by the rcu_node in question.  The CPU hotplug lock is still
17243 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
17244         free_cpumask_var(cm);
17245  }
17246  
17247 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
17248 -       .store                  = &rcu_cpu_kthread_task,
17249 -       .thread_should_run      = rcu_cpu_kthread_should_run,
17250 -       .thread_fn              = rcu_cpu_kthread,
17251 -       .thread_comm            = "rcuc/%u",
17252 -       .setup                  = rcu_cpu_kthread_setup,
17253 -       .park                   = rcu_cpu_kthread_park,
17254 -};
17255 -
17256  /*
17257   * Spawn boost kthreads -- called as soon as the scheduler is running.
17258   */
17259  static void __init rcu_spawn_boost_kthreads(void)
17260  {
17261         struct rcu_node *rnp;
17262 -       int cpu;
17263 -
17264 -       for_each_possible_cpu(cpu)
17265 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
17266 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
17267         rcu_for_each_leaf_node(rcu_state_p, rnp)
17268                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
17269  }
17270 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
17271         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
17272  }
17273  
17274 -static void invoke_rcu_callbacks_kthread(void)
17275 -{
17276 -       WARN_ON_ONCE(1);
17277 -}
17278 -
17279  static bool rcu_is_callbacks_kthread(void)
17280  {
17281         return false;
17282 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
17283  
17284  #endif /* #else #ifdef CONFIG_RCU_BOOST */
17285  
17286 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
17287 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
17288  
17289  /*
17290   * Check to see if any future RCU-related work will need to be done
17291 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
17292         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
17293                ? 0 : rcu_cpu_has_callbacks(NULL);
17294  }
17295 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
17296  
17297 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
17298  /*
17299   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
17300   * after it.
17301 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
17302         return cbs_ready;
17303  }
17304  
17305 +#ifndef CONFIG_PREEMPT_RT_FULL
17306 +
17307  /*
17308   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
17309   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
17310 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
17311         *nextevt = basemono + dj * TICK_NSEC;
17312         return 0;
17313  }
17314 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
17315  
17316  /*
17317   * Prepare a CPU for idle from an RCU perspective.  The first major task
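tree_plugin.h gains rcu_cpu_kthread_setup(), which, when RCU_BOOST is configured, switches the rcuc kthread to SCHED_FIFO at kthread_prio. The userspace counterpart of that single call looks roughly like this, assuming priority 1 and sufficient privileges (both assumptions, not taken from the patch):

#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        /* kthread_prio defaults to 1 when boosting; 1 is assumed here as well */
        struct sched_param sp = { .sched_priority = 1 };

        /* needs root or CAP_SYS_NICE, much like setting up an RT kernel thread */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
                return 1;
        }
        printf("now SCHED_FIFO at priority %d\n", sp.sched_priority);
        return 0;
}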
17318 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
17319 index 4f6db7e6a117..ee02e1e1b3e5 100644
17320 --- a/kernel/rcu/update.c
17321 +++ b/kernel/rcu/update.c
17322 @@ -62,7 +62,7 @@
17323  #ifndef CONFIG_TINY_RCU
17324  module_param(rcu_expedited, int, 0);
17325  module_param(rcu_normal, int, 0);
17326 -static int rcu_normal_after_boot;
17327 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
17328  module_param(rcu_normal_after_boot, int, 0);
17329  #endif /* #ifndef CONFIG_TINY_RCU */
17330  
17331 @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void)
17332  }
17333  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
17334  
17335 -static atomic_t rcu_expedited_nesting =
17336 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
17337 +static atomic_t rcu_expedited_nesting =        ATOMIC_INIT(1);
17338  
17339  /*
17340   * Should normal grace-period primitives be expedited?  Intended for
17341 @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
17342   */
17343  void rcu_end_inkernel_boot(void)
17344  {
17345 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
17346 -               rcu_unexpedite_gp();
17347 +       rcu_unexpedite_gp();
17348         if (rcu_normal_after_boot)
17349                 WRITE_ONCE(rcu_normal, 1);
17350  }
17351 @@ -298,6 +296,7 @@ int rcu_read_lock_held(void)
17352  }
17353  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
17354  
17355 +#ifndef CONFIG_PREEMPT_RT_FULL
17356  /**
17357   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
17358   *
17359 @@ -324,6 +323,7 @@ int rcu_read_lock_bh_held(void)
17360         return in_softirq() || irqs_disabled();
17361  }
17362  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
17363 +#endif
17364  
17365  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
17366  
17367 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
17368 index 5e59b832ae2b..7337a7f60e3f 100644
17369 --- a/kernel/sched/Makefile
17370 +++ b/kernel/sched/Makefile
17371 @@ -17,7 +17,7 @@ endif
17372  
17373  obj-y += core.o loadavg.o clock.o cputime.o
17374  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
17375 -obj-y += wait.o swait.o completion.o idle.o
17376 +obj-y += wait.o swait.o swork.o completion.o idle.o
17377  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
17378  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17379  obj-$(CONFIG_SCHEDSTATS) += stats.o
17380 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
17381 index 8d0f35debf35..b62cf6400fe0 100644
17382 --- a/kernel/sched/completion.c
17383 +++ b/kernel/sched/completion.c
17384 @@ -30,10 +30,10 @@ void complete(struct completion *x)
17385  {
17386         unsigned long flags;
17387  
17388 -       spin_lock_irqsave(&x->wait.lock, flags);
17389 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17390         x->done++;
17391 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
17392 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17393 +       swake_up_locked(&x->wait);
17394 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17395  }
17396  EXPORT_SYMBOL(complete);
17397  
17398 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
17399  {
17400         unsigned long flags;
17401  
17402 -       spin_lock_irqsave(&x->wait.lock, flags);
17403 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17404         x->done += UINT_MAX/2;
17405 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
17406 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17407 +       swake_up_all_locked(&x->wait);
17408 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17409  }
17410  EXPORT_SYMBOL(complete_all);
17411  
17412 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
17413                    long (*action)(long), long timeout, int state)
17414  {
17415         if (!x->done) {
17416 -               DECLARE_WAITQUEUE(wait, current);
17417 +               DECLARE_SWAITQUEUE(wait);
17418  
17419 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
17420 +               __prepare_to_swait(&x->wait, &wait);
17421                 do {
17422                         if (signal_pending_state(state, current)) {
17423                                 timeout = -ERESTARTSYS;
17424                                 break;
17425                         }
17426                         __set_current_state(state);
17427 -                       spin_unlock_irq(&x->wait.lock);
17428 +                       raw_spin_unlock_irq(&x->wait.lock);
17429                         timeout = action(timeout);
17430 -                       spin_lock_irq(&x->wait.lock);
17431 +                       raw_spin_lock_irq(&x->wait.lock);
17432                 } while (!x->done && timeout);
17433 -               __remove_wait_queue(&x->wait, &wait);
17434 +               __finish_swait(&x->wait, &wait);
17435                 if (!x->done)
17436                         return timeout;
17437         }
17438 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
17439  {
17440         might_sleep();
17441  
17442 -       spin_lock_irq(&x->wait.lock);
17443 +       raw_spin_lock_irq(&x->wait.lock);
17444         timeout = do_wait_for_common(x, action, timeout, state);
17445 -       spin_unlock_irq(&x->wait.lock);
17446 +       raw_spin_unlock_irq(&x->wait.lock);
17447         return timeout;
17448  }
17449  
17450 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
17451         if (!READ_ONCE(x->done))
17452                 return 0;
17453  
17454 -       spin_lock_irqsave(&x->wait.lock, flags);
17455 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
17456         if (!x->done)
17457                 ret = 0;
17458         else
17459                 x->done--;
17460 -       spin_unlock_irqrestore(&x->wait.lock, flags);
17461 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
17462         return ret;
17463  }
17464  EXPORT_SYMBOL(try_wait_for_completion);
17465 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
17466          * after it's acquired the lock.
17467          */
17468         smp_rmb();
17469 -       spin_unlock_wait(&x->wait.lock);
17470 +       raw_spin_unlock_wait(&x->wait.lock);
17471         return true;
17472  }
17473  EXPORT_SYMBOL(completion_done);
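The completion conversion keeps the semantics (a done counter guarded by a lock, with waiters sleeping until it becomes non-zero) but moves them onto a raw spinlock and a simple swait queue so complete() remains usable from contexts that must not sleep on RT. The same protocol rendered with a pthread mutex and condvar, purely as an illustrative model rather than the kernel implementation:

#include <pthread.h>
#include <limits.h>
#include <stdio.h>

struct completion {
        unsigned int done;
        pthread_mutex_t lock;
        pthread_cond_t wait;
};

#define COMPLETION_INIT { 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER }

void complete(struct completion *x)             /* wake one waiter */
{
        pthread_mutex_lock(&x->lock);
        x->done++;
        pthread_cond_signal(&x->wait);
        pthread_mutex_unlock(&x->lock);
}

void complete_all(struct completion *x)         /* wake every current and future waiter */
{
        pthread_mutex_lock(&x->lock);
        x->done += UINT_MAX / 2;                /* same "effectively unlimited" trick */
        pthread_cond_broadcast(&x->wait);
        pthread_mutex_unlock(&x->lock);
}

void wait_for_completion(struct completion *x)
{
        pthread_mutex_lock(&x->lock);
        while (!x->done)
                pthread_cond_wait(&x->wait, &x->lock);
        x->done--;                              /* consume one completion */
        pthread_mutex_unlock(&x->lock);
}

static struct completion done_signal = COMPLETION_INIT;

static void *worker(void *arg)
{
        (void)arg;
        puts("worker finished");
        complete(&done_signal);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        wait_for_completion(&done_signal);
        pthread_join(t, NULL);
        return 0;
}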
17474 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
17475 index 154fd689fe02..30b24f774198 100644
17476 --- a/kernel/sched/core.c
17477 +++ b/kernel/sched/core.c
17478 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
17479   * Number of tasks to iterate in a single balance run.
17480   * Limited because this is done with IRQs disabled.
17481   */
17482 +#ifndef CONFIG_PREEMPT_RT_FULL
17483  const_debug unsigned int sysctl_sched_nr_migrate = 32;
17484 +#else
17485 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
17486 +#endif
17487  
17488  /*
17489   * period over which we average the RT time consumption, measured
17490 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
17491  
17492         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17493         rq->hrtick_timer.function = hrtick;
17494 +       rq->hrtick_timer.irqsafe = 1;
17495  }
17496  #else  /* CONFIG_SCHED_HRTICK */
17497  static inline void hrtick_clear(struct rq *rq)
17498 @@ -449,7 +454,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
17499         head->lastp = &node->next;
17500  }
17501  
17502 -void wake_up_q(struct wake_q_head *head)
17503 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
17504  {
17505         struct wake_q_node *node = head->first;
17506  
17507 @@ -466,7 +471,10 @@ void wake_up_q(struct wake_q_head *head)
17508                  * wake_up_process() implies a wmb() to pair with the queueing
17509                  * in wake_q_add() so as not to miss wakeups.
17510                  */
17511 -               wake_up_process(task);
17512 +               if (sleeper)
17513 +                       wake_up_lock_sleeper(task);
17514 +               else
17515 +                       wake_up_process(task);
17516                 put_task_struct(task);
17517         }
17518  }
17519 @@ -502,6 +510,38 @@ void resched_curr(struct rq *rq)
17520                 trace_sched_wake_idle_without_ipi(cpu);
17521  }
17522  
17523 +#ifdef CONFIG_PREEMPT_LAZY
17524 +void resched_curr_lazy(struct rq *rq)
17525 +{
17526 +       struct task_struct *curr = rq->curr;
17527 +       int cpu;
17528 +
17529 +       if (!sched_feat(PREEMPT_LAZY)) {
17530 +               resched_curr(rq);
17531 +               return;
17532 +       }
17533 +
17534 +       lockdep_assert_held(&rq->lock);
17535 +
17536 +       if (test_tsk_need_resched(curr))
17537 +               return;
17538 +
17539 +       if (test_tsk_need_resched_lazy(curr))
17540 +               return;
17541 +
17542 +       set_tsk_need_resched_lazy(curr);
17543 +
17544 +       cpu = cpu_of(rq);
17545 +       if (cpu == smp_processor_id())
17546 +               return;
17547 +
17548 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
17549 +       smp_mb();
17550 +       if (!tsk_is_polling(curr))
17551 +               smp_send_reschedule(cpu);
17552 +}
17553 +#endif
17554 +
17555  void resched_cpu(int cpu)
17556  {
17557         struct rq *rq = cpu_rq(cpu);
17558 @@ -525,11 +565,14 @@ void resched_cpu(int cpu)
17559   */
17560  int get_nohz_timer_target(void)
17561  {
17562 -       int i, cpu = smp_processor_id();
17563 +       int i, cpu;
17564         struct sched_domain *sd;
17565  
17566 +       preempt_disable_rt();
17567 +       cpu = smp_processor_id();
17568 +
17569         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
17570 -               return cpu;
17571 +               goto preempt_en_rt;
17572  
17573         rcu_read_lock();
17574         for_each_domain(cpu, sd) {
17575 @@ -548,6 +591,8 @@ int get_nohz_timer_target(void)
17576                 cpu = housekeeping_any_cpu();
17577  unlock:
17578         rcu_read_unlock();
17579 +preempt_en_rt:
17580 +       preempt_enable_rt();
17581         return cpu;
17582  }
17583  /*
17584 @@ -1100,6 +1145,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
17585  
17586         lockdep_assert_held(&p->pi_lock);
17587  
17588 +       if (__migrate_disabled(p)) {
17589 +               cpumask_copy(&p->cpus_allowed, new_mask);
17590 +               return;
17591 +       }
17592 +
17593         queued = task_on_rq_queued(p);
17594         running = task_current(rq, p);
17595  
17596 @@ -1122,6 +1172,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
17597                 set_curr_task(rq, p);
17598  }
17599  
17600 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
17601 +static DEFINE_MUTEX(sched_down_mutex);
17602 +static cpumask_t sched_down_cpumask;
17603 +
17604 +void tell_sched_cpu_down_begin(int cpu)
17605 +{
17606 +       mutex_lock(&sched_down_mutex);
17607 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
17608 +       mutex_unlock(&sched_down_mutex);
17609 +}
17610 +
17611 +void tell_sched_cpu_down_done(int cpu)
17612 +{
17613 +       mutex_lock(&sched_down_mutex);
17614 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
17615 +       mutex_unlock(&sched_down_mutex);
17616 +}
17617 +
17618 +/**
17619 + * migrate_me - try to move the current task off this cpu
17620 + *
17621 + * Used by the pin_current_cpu() code to try to get tasks
17622 + * to move off the current CPU as it is going down.
17623 + * It will only move the task if the task isn't pinned to
17624 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
17625 + * and the task is in a RUNNING state. Otherwise the
17626 + * movement of the task will wake it up (change its state
17627 + * to running) even though the task did not expect that.
17628 + *
17629 + * Returns 1 if it succeeded in moving the current task
17630 + *         0 otherwise.
17631 + */
17632 +int migrate_me(void)
17633 +{
17634 +       struct task_struct *p = current;
17635 +       struct migration_arg arg;
17636 +       struct cpumask *cpumask;
17637 +       struct cpumask *mask;
17638 +       unsigned int dest_cpu;
17639 +       struct rq_flags rf;
17640 +       struct rq *rq;
17641 +
17642 +       /*
17643 +        * We cannot migrate tasks bound to a CPU or tasks that are not
17644 +        * running. The movement of the task will wake it up.
17645 +        */
17646 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
17647 +               return 0;
17648 +
17649 +       mutex_lock(&sched_down_mutex);
17650 +       rq = task_rq_lock(p, &rf);
17651 +
17652 +       cpumask = this_cpu_ptr(&sched_cpumasks);
17653 +       mask = &p->cpus_allowed;
17654 +
17655 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
17656 +
17657 +       if (!cpumask_weight(cpumask)) {
17658 +               /* It's only on this CPU? */
17659 +               task_rq_unlock(rq, p, &rf);
17660 +               mutex_unlock(&sched_down_mutex);
17661 +               return 0;
17662 +       }
17663 +
17664 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
17665 +
17666 +       arg.task = p;
17667 +       arg.dest_cpu = dest_cpu;
17668 +
17669 +       task_rq_unlock(rq, p, &rf);
17670 +
17671 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
17672 +       tlb_migrate_finish(p->mm);
17673 +       mutex_unlock(&sched_down_mutex);
17674 +
17675 +       return 1;
17676 +}
17677 +
17678  /*
17679   * Change a given task's CPU affinity. Migrate the thread to a
17680   * proper CPU and schedule it away if the CPU it's executing on
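migrate_me() above boils down to mask arithmetic: intersect the task's affinity with the complement of the CPUs going down and, if anything remains, pick a destination from the result. The same arithmetic with the userspace cpu_set_t API; the concrete masks below are arbitrary examples:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t allowed, going_down, candidates;
        int cpu, dest = -1;

        /* the task may run on CPUs 0-3 */
        CPU_ZERO(&allowed);
        for (cpu = 0; cpu < 4; cpu++)
                CPU_SET(cpu, &allowed);

        /* CPU 0 is being unplugged */
        CPU_ZERO(&going_down);
        CPU_SET(0, &going_down);

        /* candidates = allowed & ~going_down, like cpumask_andnot() */
        CPU_ZERO(&candidates);
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &allowed) && !CPU_ISSET(cpu, &going_down))
                        CPU_SET(cpu, &candidates);

        if (CPU_COUNT(&candidates) == 0) {
                puts("task is pinned to the outgoing CPU; leave it alone");
                return 0;
        }

        /* cpumask_any_and() analogue: take the first remaining CPU */
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &candidates)) {
                        dest = cpu;
                        break;
                }
        printf("migrate the task to CPU %d\n", dest);
        return 0;
}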
17681 @@ -1179,7 +1307,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
17682         }
17683  
17684         /* Can the task run on the task's current CPU? If so, we're done */
17685 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
17686 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
17687                 goto out;
17688  
17689         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
17690 @@ -1366,6 +1494,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
17691         return ret;
17692  }
17693  
17694 +static bool check_task_state(struct task_struct *p, long match_state)
17695 +{
17696 +       bool match = false;
17697 +
17698 +       raw_spin_lock_irq(&p->pi_lock);
17699 +       if (p->state == match_state || p->saved_state == match_state)
17700 +               match = true;
17701 +       raw_spin_unlock_irq(&p->pi_lock);
17702 +
17703 +       return match;
17704 +}
17705 +
17706  /*
17707   * wait_task_inactive - wait for a thread to unschedule.
17708   *
17709 @@ -1410,7 +1550,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
17710                  * is actually now running somewhere else!
17711                  */
17712                 while (task_running(rq, p)) {
17713 -                       if (match_state && unlikely(p->state != match_state))
17714 +                       if (match_state && !check_task_state(p, match_state))
17715                                 return 0;
17716                         cpu_relax();
17717                 }
17718 @@ -1425,7 +1565,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
17719                 running = task_running(rq, p);
17720                 queued = task_on_rq_queued(p);
17721                 ncsw = 0;
17722 -               if (!match_state || p->state == match_state)
17723 +               if (!match_state || p->state == match_state ||
17724 +                   p->saved_state == match_state)
17725                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
17726                 task_rq_unlock(rq, p, &rf);
17727  
17728 @@ -1680,10 +1821,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
17729  {
17730         activate_task(rq, p, en_flags);
17731         p->on_rq = TASK_ON_RQ_QUEUED;
17732 -
17733 -       /* if a worker is waking up, notify workqueue */
17734 -       if (p->flags & PF_WQ_WORKER)
17735 -               wq_worker_waking_up(p, cpu_of(rq));
17736  }
17737  
17738  /*
17739 @@ -2018,8 +2155,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
17740          */
17741         smp_mb__before_spinlock();
17742         raw_spin_lock_irqsave(&p->pi_lock, flags);
17743 -       if (!(p->state & state))
17744 +       if (!(p->state & state)) {
17745 +               /*
17746 +                * The task might be running due to a spinlock sleeper
17747 +                * wakeup. Check the saved state and set it to running
17748 +                * if the wakeup condition is true.
17749 +                */
17750 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
17751 +                       if (p->saved_state & state) {
17752 +                               p->saved_state = TASK_RUNNING;
17753 +                               success = 1;
17754 +                       }
17755 +               }
17756                 goto out;
17757 +       }
17758 +
17759 +       /*
17760 +        * If this is a regular wakeup, then we can unconditionally
17761 +        * clear the saved state of a "lock sleeper".
17762 +        */
17763 +       if (!(wake_flags & WF_LOCK_SLEEPER))
17764 +               p->saved_state = TASK_RUNNING;
17765  
17766         trace_sched_waking(p);
17767  
17768 @@ -2102,53 +2258,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
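try_to_wake_up() now has to respect task->saved_state: a regular wakeup that races with a sleeping-lock sleep must promote the saved state to running rather than be lost, while a WF_LOCK_SLEEPER wakeup leaves saved_state alone. A small single-threaded model of just that branch; the struct, the flag values and the helper name are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING            0x0000
#define TASK_INTERRUPTIBLE      0x0001
#define TASK_ON_RTMUTEX         0x0200  /* invented "blocked on a sleeping lock" value */
#define WF_LOCK_SLEEPER         0x02    /* invented flag value */

struct toy_task {
        unsigned int state;             /* state the scheduler currently sees */
        unsigned int saved_state;       /* real state while parked on a sleeping lock */
};

/* Returns true if the wakeup took effect. */
static bool toy_wake_up(struct toy_task *p, unsigned int state, int wake_flags)
{
        if (!(p->state & state)) {
                /*
                 * Regular wakeup racing with a sleeping-lock sleep: mark the
                 * saved state runnable so the wakeup is not lost; the task
                 * itself stays parked until it gets the lock back.
                 */
                if (!(wake_flags & WF_LOCK_SLEEPER) && (p->saved_state & state)) {
                        p->saved_state = TASK_RUNNING;
                        return true;
                }
                return false;
        }
        /* Normal path: a regular wakeup also clears any saved sleeper state. */
        if (!(wake_flags & WF_LOCK_SLEEPER))
                p->saved_state = TASK_RUNNING;
        p->state = TASK_RUNNING;
        return true;
}

int main(void)
{
        struct toy_task p = { .state = TASK_ON_RTMUTEX,
                              .saved_state = TASK_INTERRUPTIBLE };

        printf("woken=%d state=%#x saved=%#x\n",
               toy_wake_up(&p, TASK_INTERRUPTIBLE, 0), p.state, p.saved_state);
        return 0;
}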
17769  }
17770  
17771  /**
17772 - * try_to_wake_up_local - try to wake up a local task with rq lock held
17773 - * @p: the thread to be awakened
17774 - * @cookie: context's cookie for pinning
17775 - *
17776 - * Put @p on the run-queue if it's not already there. The caller must
17777 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
17778 - * the current task.
17779 - */
17780 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
17781 -{
17782 -       struct rq *rq = task_rq(p);
17783 -
17784 -       if (WARN_ON_ONCE(rq != this_rq()) ||
17785 -           WARN_ON_ONCE(p == current))
17786 -               return;
17787 -
17788 -       lockdep_assert_held(&rq->lock);
17789 -
17790 -       if (!raw_spin_trylock(&p->pi_lock)) {
17791 -               /*
17792 -                * This is OK, because current is on_cpu, which avoids it being
17793 -                * picked for load-balance and preemption/IRQs are still
17794 -                * disabled avoiding further scheduler activity on it and we've
17795 -                * not yet picked a replacement task.
17796 -                */
17797 -               lockdep_unpin_lock(&rq->lock, cookie);
17798 -               raw_spin_unlock(&rq->lock);
17799 -               raw_spin_lock(&p->pi_lock);
17800 -               raw_spin_lock(&rq->lock);
17801 -               lockdep_repin_lock(&rq->lock, cookie);
17802 -       }
17803 -
17804 -       if (!(p->state & TASK_NORMAL))
17805 -               goto out;
17806 -
17807 -       trace_sched_waking(p);
17808 -
17809 -       if (!task_on_rq_queued(p))
17810 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
17811 -
17812 -       ttwu_do_wakeup(rq, p, 0, cookie);
17813 -       ttwu_stat(p, smp_processor_id(), 0);
17814 -out:
17815 -       raw_spin_unlock(&p->pi_lock);
17816 -}
17817 -
17818 -/**
17819   * wake_up_process - Wake up a specific process
17820   * @p: The process to be woken up.
17821   *
17822 @@ -2166,6 +2275,18 @@ int wake_up_process(struct task_struct *p)
17823  }
17824  EXPORT_SYMBOL(wake_up_process);
17825  
17826 +/**
17827 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
17828 + * @p: The process to be woken up.
17829 + *
17830 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
17831 + * the nature of the wakeup.
17832 + */
17833 +int wake_up_lock_sleeper(struct task_struct *p)
17834 +{
17835 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
17836 +}
17837 +
17838  int wake_up_state(struct task_struct *p, unsigned int state)
17839  {
17840         return try_to_wake_up(p, state, 0);
17841 @@ -2442,6 +2563,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
17842         p->on_cpu = 0;
17843  #endif
17844         init_task_preempt_count(p);
17845 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
17846 +       task_thread_info(p)->preempt_lazy_count = 0;
17847 +#endif
17848  #ifdef CONFIG_SMP
17849         plist_node_init(&p->pushable_tasks, MAX_PRIO);
17850         RB_CLEAR_NODE(&p->pushable_dl_tasks);
17851 @@ -2770,21 +2894,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
17852         finish_arch_post_lock_switch();
17853  
17854         fire_sched_in_preempt_notifiers(current);
17855 +       /*
17856 +        * We use mmdrop_delayed() here so we don't have to do the
17857 +        * full __mmdrop() when we are the last user.
17858 +        */
17859         if (mm)
17860 -               mmdrop(mm);
17861 +               mmdrop_delayed(mm);
17862         if (unlikely(prev_state == TASK_DEAD)) {
17863                 if (prev->sched_class->task_dead)
17864                         prev->sched_class->task_dead(prev);
17865  
17866 -               /*
17867 -                * Remove function-return probe instances associated with this
17868 -                * task and put them back on the free list.
17869 -                */
17870 -               kprobe_flush_task(prev);
17871 -
17872 -               /* Task is done with its stack. */
17873 -               put_task_stack(prev);
17874 -
17875                 put_task_struct(prev);
17876         }
17877  
17878 @@ -3252,6 +3371,77 @@ static inline void schedule_debug(struct task_struct *prev)
17879         schedstat_inc(this_rq()->sched_count);
17880  }
17881  
17882 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
17883 +
17884 +void migrate_disable(void)
17885 +{
17886 +       struct task_struct *p = current;
17887 +
17888 +       if (in_atomic() || irqs_disabled()) {
17889 +#ifdef CONFIG_SCHED_DEBUG
17890 +               p->migrate_disable_atomic++;
17891 +#endif
17892 +               return;
17893 +       }
17894 +
17895 +#ifdef CONFIG_SCHED_DEBUG
17896 +       if (unlikely(p->migrate_disable_atomic)) {
17897 +               tracing_off();
17898 +               WARN_ON_ONCE(1);
17899 +       }
17900 +#endif
17901 +
17902 +       if (p->migrate_disable) {
17903 +               p->migrate_disable++;
17904 +               return;
17905 +       }
17906 +
17907 +       preempt_disable();
17908 +       preempt_lazy_disable();
17909 +       pin_current_cpu();
17910 +       p->migrate_disable = 1;
17911 +       preempt_enable();
17912 +}
17913 +EXPORT_SYMBOL(migrate_disable);
17914 +
17915 +void migrate_enable(void)
17916 +{
17917 +       struct task_struct *p = current;
17918 +
17919 +       if (in_atomic() || irqs_disabled()) {
17920 +#ifdef CONFIG_SCHED_DEBUG
17921 +               p->migrate_disable_atomic--;
17922 +#endif
17923 +               return;
17924 +       }
17925 +
17926 +#ifdef CONFIG_SCHED_DEBUG
17927 +       if (unlikely(p->migrate_disable_atomic)) {
17928 +               tracing_off();
17929 +               WARN_ON_ONCE(1);
17930 +       }
17931 +#endif
17932 +       WARN_ON_ONCE(p->migrate_disable <= 0);
17933 +
17934 +       if (p->migrate_disable > 1) {
17935 +               p->migrate_disable--;
17936 +               return;
17937 +       }
17938 +
17939 +       preempt_disable();
17940 +       /*
17941 +        * Clearing migrate_disable causes tsk_cpus_allowed to
17942 +        * show the task's original cpu affinity.
17943 +        */
17944 +       p->migrate_disable = 0;
17945 +
17946 +       unpin_current_cpu();
17947 +       preempt_enable();
17948 +       preempt_lazy_enable();
17949 +}
17950 +EXPORT_SYMBOL(migrate_enable);
17951 +#endif
17952 +
17953  /*
17954   * Pick up the highest-prio task:
17955   */
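migrate_disable()/migrate_enable() behave as per-task nesting counters: only the outermost disable pins the task to its CPU and only the matching enable unpins it, while calls from atomic or IRQ-disabled context are no-ops since the task cannot be migrated there anyway. A skeleton of that counting discipline, with a thread-local counter standing in for p->migrate_disable and the pin/unpin helpers as placeholders (the atomic-context checks and preemption plumbing are left out):

#include <assert.h>
#include <stdio.h>

static _Thread_local int migrate_disable_cnt;   /* p->migrate_disable stand-in */

static void pin_current_cpu(void)   { puts("pinned to the current CPU"); }
static void unpin_current_cpu(void) { puts("unpinned"); }

static void migrate_disable(void)
{
        if (migrate_disable_cnt++ == 0)
                pin_current_cpu();              /* only the outermost call pins */
}

static void migrate_enable(void)
{
        assert(migrate_disable_cnt > 0);        /* WARN_ON_ONCE() analogue */
        if (--migrate_disable_cnt == 0)
                unpin_current_cpu();            /* only the outermost call unpins */
}

int main(void)
{
        migrate_disable();
        migrate_disable();                      /* nested: just bumps the counter */
        migrate_enable();
        migrate_enable();                       /* outermost: really unpins */
        return 0;
}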
17956 @@ -3368,19 +3558,6 @@ static void __sched notrace __schedule(bool preempt)
17957                 } else {
17958                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
17959                         prev->on_rq = 0;
17960 -
17961 -                       /*
17962 -                        * If a worker went to sleep, notify and ask workqueue
17963 -                        * whether it wants to wake up a task to maintain
17964 -                        * concurrency.
17965 -                        */
17966 -                       if (prev->flags & PF_WQ_WORKER) {
17967 -                               struct task_struct *to_wakeup;
17968 -
17969 -                               to_wakeup = wq_worker_sleeping(prev);
17970 -                               if (to_wakeup)
17971 -                                       try_to_wake_up_local(to_wakeup, cookie);
17972 -                       }
17973                 }
17974                 switch_count = &prev->nvcsw;
17975         }
17976 @@ -3390,6 +3567,7 @@ static void __sched notrace __schedule(bool preempt)
17977  
17978         next = pick_next_task(rq, prev, cookie);
17979         clear_tsk_need_resched(prev);
17980 +       clear_tsk_need_resched_lazy(prev);
17981         clear_preempt_need_resched();
17982         rq->clock_skip_update = 0;
17983  
17984 @@ -3437,9 +3615,20 @@ void __noreturn do_task_dead(void)
17985  
17986  static inline void sched_submit_work(struct task_struct *tsk)
17987  {
17988 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
17989 +       if (!tsk->state)
17990                 return;
17991         /*
17992 +        * If a worker went to sleep, notify and ask workqueue whether
17993 +        * it wants to wake up a task to maintain concurrency.
17994 +        */
17995 +       if (tsk->flags & PF_WQ_WORKER)
17996 +               wq_worker_sleeping(tsk);
17997 +
17998 +
17999 +       if (tsk_is_pi_blocked(tsk))
18000 +               return;
18001 +
18002 +       /*
18003          * If we are going to sleep and we have plugged IO queued,
18004          * make sure to submit it to avoid deadlocks.
18005          */
18006 @@ -3447,6 +3636,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
18007                 blk_schedule_flush_plug(tsk);
18008  }
18009  
18010 +static void sched_update_worker(struct task_struct *tsk)
18011 +{
18012 +       if (tsk->flags & PF_WQ_WORKER)
18013 +               wq_worker_running(tsk);
18014 +}
18015 +
18016  asmlinkage __visible void __sched schedule(void)
18017  {
18018         struct task_struct *tsk = current;
18019 @@ -3457,6 +3652,7 @@ asmlinkage __visible void __sched schedule(void)
18020                 __schedule(false);
18021                 sched_preempt_enable_no_resched();
18022         } while (need_resched());
18023 +       sched_update_worker(tsk);
18024  }
18025  EXPORT_SYMBOL(schedule);
18026  
18027 @@ -3520,6 +3716,30 @@ static void __sched notrace preempt_schedule_common(void)
18028         } while (need_resched());
18029  }
18030  
18031 +#ifdef CONFIG_PREEMPT_LAZY
18032 +/*
18033 + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since this is
18034 + * set by an RT task. Otherwise we try to avoid being scheduled out as long as
18035 + * the preempt_lazy_count counter is >0.
18036 + */
18037 +static __always_inline int preemptible_lazy(void)
18038 +{
18039 +       if (test_thread_flag(TIF_NEED_RESCHED))
18040 +               return 1;
18041 +       if (current_thread_info()->preempt_lazy_count)
18042 +               return 0;
18043 +       return 1;
18044 +}
18045 +
18046 +#else
18047 +
18048 +static inline int preemptible_lazy(void)
18049 +{
18050 +       return 1;
18051 +}
18052 +
18053 +#endif
18054 +
18055  #ifdef CONFIG_PREEMPT
18056  /*
18057   * this is the entry point to schedule() from in-kernel preemption
18058 @@ -3534,7 +3754,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
18059          */
18060         if (likely(!preemptible()))
18061                 return;
18062 -
18063 +       if (!preemptible_lazy())
18064 +               return;
18065         preempt_schedule_common();
18066  }
18067  NOKPROBE_SYMBOL(preempt_schedule);
18068 @@ -3561,6 +3782,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
18069         if (likely(!preemptible()))
18070                 return;
18071  
18072 +       if (!preemptible_lazy())
18073 +               return;
18074 +
18075         do {
18076                 /*
18077                  * Because the function tracer can trace preempt_count_sub()
18078 @@ -3583,7 +3807,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
18079                  * an infinite recursion.
18080                  */
18081                 prev_ctx = exception_enter();
18082 +               /*
18083 +                * The add/subtract must not be traced by the function
18084 +                * tracer. But we still want to account for the
18085 +                * preempt off latency tracer. Since the _notrace versions
18086 +                * of add/subtract skip the accounting for latency tracer
18087 +                * we must force it manually.
18088 +                */
18089 +               start_critical_timings();
18090                 __schedule(true);
18091 +               stop_critical_timings();
18092                 exception_exit(prev_ctx);
18093  
18094                 preempt_latency_stop(1);
18095 @@ -3629,10 +3862,25 @@ EXPORT_SYMBOL(default_wake_function);
18096  
18097  #ifdef CONFIG_RT_MUTEXES
18098  
18099 +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
18100 +{
18101 +       if (pi_task)
18102 +               prio = min(prio, pi_task->prio);
18103 +
18104 +       return prio;
18105 +}
18106 +
18107 +static inline int rt_effective_prio(struct task_struct *p, int prio)
18108 +{
18109 +       struct task_struct *pi_task = rt_mutex_get_top_task(p);
18110 +
18111 +       return __rt_effective_prio(pi_task, prio);
18112 +}
18113 +
18114  /*
18115   * rt_mutex_setprio - set the current priority of a task
18116 - * @p: task
18117 - * @prio: prio value (kernel-internal form)
18118 + * @p: task to boost
18119 + * @pi_task: donor task
18120   *
18121   * This function changes the 'effective' priority of a task. It does
18122   * not touch ->normal_prio like __setscheduler().
18123 @@ -3640,16 +3888,40 @@ EXPORT_SYMBOL(default_wake_function);
18124   * Used by the rt_mutex code to implement priority inheritance
18125   * logic. Call site only calls if the priority of the task changed.
18126   */
18127 -void rt_mutex_setprio(struct task_struct *p, int prio)
18128 +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
18129  {
18130 -       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
18131 +       int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
18132         const struct sched_class *prev_class;
18133         struct rq_flags rf;
18134         struct rq *rq;
18135  
18136 -       BUG_ON(prio > MAX_PRIO);
18137 +       /* XXX used to be waiter->prio, not waiter->task->prio */
18138 +       prio = __rt_effective_prio(pi_task, p->normal_prio);
18139 +
18140 +       /*
18141 +        * If nothing changed, bail early.
18142 +        */
18143 +       if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
18144 +               return;
18145  
18146         rq = __task_rq_lock(p, &rf);
18147 +       /*
18148 +        * Set under pi_lock && rq->lock, such that the value can be used under
18149 +        * either lock.
18150 +        *
18151 +        * Note that there is a lot of trickiness in making this pointer cache work
18152 +        * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
18153 +        * ensure a task is de-boosted (pi_task is set to NULL) before the
18154 +        * task is allowed to run again (and can exit). This ensures the pointer
18155 +        * points to a blocked task -- which guarantees the task is present.
18156 +        */
18157 +       p->pi_top_task = pi_task;
18158 +
18159 +       /*
18160 +        * For FIFO/RR we only need to set prio; if that matches, we're done.
18161 +        */
18162 +       if (prio == p->prio && !dl_prio(prio))
18163 +               goto out_unlock;
18164  
18165         /*
18166          * Idle task boosting is a nono in general. There is one
18167 @@ -3669,7 +3941,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18168                 goto out_unlock;
18169         }
18170  
18171 -       trace_sched_pi_setprio(p, prio);
18172 +       trace_sched_pi_setprio(p, pi_task);
18173         oldprio = p->prio;
18174  
18175         if (oldprio == prio)
18176 @@ -3693,7 +3965,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18177          *          running task
18178          */
18179         if (dl_prio(prio)) {
18180 -               struct task_struct *pi_task = rt_mutex_get_top_task(p);
18181                 if (!dl_prio(p->normal_prio) ||
18182                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
18183                         p->dl.dl_boosted = 1;
18184 @@ -3730,6 +4001,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
18185         balance_callback(rq);
18186         preempt_enable();
18187  }
18188 +#else
18189 +static inline int rt_effective_prio(struct task_struct *p, int prio)
18190 +{
18191 +       return prio;
18192 +}
18193  #endif
18194  
18195  void set_user_nice(struct task_struct *p, long nice)
18196 @@ -3974,10 +4250,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
18197          * Keep a potential priority boosting if called from
18198          * sched_setscheduler().
18199          */
18200 +       p->prio = normal_prio(p);
18201         if (keep_boost)
18202 -               p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
18203 -       else
18204 -               p->prio = normal_prio(p);
18205 +               p->prio = rt_effective_prio(p, p->prio);
18206  
18207         if (dl_prio(p->prio))
18208                 p->sched_class = &dl_sched_class;
18209 @@ -4264,7 +4539,7 @@ static int __sched_setscheduler(struct task_struct *p,
18210                  * the runqueue. This will be done when the task deboost
18211                  * itself.
18212                  */
18213 -               new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
18214 +               new_effective_prio = rt_effective_prio(p, newprio);
18215                 if (new_effective_prio == oldprio)
18216                         queue_flags &= ~DEQUEUE_MOVE;
18217         }
18218 @@ -4939,6 +5214,7 @@ int __cond_resched_lock(spinlock_t *lock)
18219  }
18220  EXPORT_SYMBOL(__cond_resched_lock);
18221  
18222 +#ifndef CONFIG_PREEMPT_RT_FULL
18223  int __sched __cond_resched_softirq(void)
18224  {
18225         BUG_ON(!in_softirq());
18226 @@ -4952,6 +5228,7 @@ int __sched __cond_resched_softirq(void)
18227         return 0;
18228  }
18229  EXPORT_SYMBOL(__cond_resched_softirq);
18230 +#endif
18231  
18232  /**
18233   * yield - yield the current processor to other threads.
18234 @@ -5315,7 +5592,9 @@ void init_idle(struct task_struct *idle, int cpu)
18235  
18236         /* Set the preempt count _outside_ the spinlocks! */
18237         init_idle_preempt_count(idle, cpu);
18238 -
18239 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
18240 +       task_thread_info(idle)->preempt_lazy_count = 0;
18241 +#endif
18242         /*
18243          * The idle tasks have their own, simple scheduling class:
18244          */
18245 @@ -5458,6 +5737,8 @@ void sched_setnuma(struct task_struct *p, int nid)
18246  #endif /* CONFIG_NUMA_BALANCING */
18247  
18248  #ifdef CONFIG_HOTPLUG_CPU
18249 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
18250 +
18251  /*
18252   * Ensures that the idle task is using init_mm right before its cpu goes
18253   * offline.
18254 @@ -5472,7 +5753,12 @@ void idle_task_exit(void)
18255                 switch_mm_irqs_off(mm, &init_mm, current);
18256                 finish_arch_post_lock_switch();
18257         }
18258 -       mmdrop(mm);
18259 +       /*
18260 +        * Defer the cleanup to an alive cpu. On RT we can neither
18261 +        * call mmdrop() nor mmdrop_delayed() from here.
18262 +        */
18263 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
18264 +
18265  }
18266  
18267  /*
18268 @@ -7418,6 +7704,10 @@ int sched_cpu_dying(unsigned int cpu)
18269         update_max_interval();
18270         nohz_balance_exit_idle(cpu);
18271         hrtick_clear(rq);
18272 +       if (per_cpu(idle_last_mm, cpu)) {
18273 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
18274 +               per_cpu(idle_last_mm, cpu) = NULL;
18275 +       }
18276         return 0;
18277  }
18278  #endif
18279 @@ -7698,7 +7988,7 @@ void __init sched_init(void)
18280  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
18281  static inline int preempt_count_equals(int preempt_offset)
18282  {
18283 -       int nested = preempt_count() + rcu_preempt_depth();
18284 +       int nested = preempt_count() + sched_rcu_preempt_depth();
18285  
18286         return (nested == preempt_offset);
18287  }
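
The rt_effective_prio()/rt_mutex_setprio() hunks above reduce priority inheritance to one rule: the lock owner runs at the numerically smallest of its own normal_prio and the top pi waiter's prio. A toy user-space sketch of that min() rule (illustration only, not part of the patch; the constants follow the kernel convention that a numerically lower prio means a higher priority):

#include <assert.h>

/* mirrors __rt_effective_prio() above, with plain ints instead of tasks */
static int effective_prio(int top_waiter_prio, int normal_prio)
{
        return top_waiter_prio < normal_prio ? top_waiter_prio : normal_prio;
}

int main(void)
{
        int owner_normal = 120; /* nice-0 SCHED_OTHER task */
        int waiter = 49;        /* SCHED_FIFO rt_priority 50 -> kernel prio 99 - 50 */

        assert(effective_prio(waiter, owner_normal) == 49);            /* boosted */
        assert(effective_prio(owner_normal, owner_normal) == 120);     /* no boost */
        return 0;
}
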
18288 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
18289 index c95c5122b105..e00accf92a4b 100644
18290 --- a/kernel/sched/deadline.c
18291 +++ b/kernel/sched/deadline.c
18292 @@ -687,6 +687,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
18293  
18294         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
18295         timer->function = dl_task_timer;
18296 +       timer->irqsafe = 1;
18297  }
18298  
18299  static
18300 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
18301 index fa178b62ea79..935224123441 100644
18302 --- a/kernel/sched/debug.c
18303 +++ b/kernel/sched/debug.c
18304 @@ -558,6 +558,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
18305         P(rt_throttled);
18306         PN(rt_time);
18307         PN(rt_runtime);
18308 +#ifdef CONFIG_SMP
18309 +       P(rt_nr_migratory);
18310 +#endif
18311  
18312  #undef PN
18313  #undef P
18314 @@ -953,6 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
18315  #endif
18316         P(policy);
18317         P(prio);
18318 +#ifdef CONFIG_PREEMPT_RT_FULL
18319 +       P(migrate_disable);
18320 +#endif
18321 +       P(nr_cpus_allowed);
18322  #undef PN_SCHEDSTAT
18323  #undef PN
18324  #undef __PN
18325 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
18326 index c242944f5cbd..4aeb2e2e41bc 100644
18327 --- a/kernel/sched/fair.c
18328 +++ b/kernel/sched/fair.c
18329 @@ -3518,7 +3518,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
18330         ideal_runtime = sched_slice(cfs_rq, curr);
18331         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
18332         if (delta_exec > ideal_runtime) {
18333 -               resched_curr(rq_of(cfs_rq));
18334 +               resched_curr_lazy(rq_of(cfs_rq));
18335                 /*
18336                  * The current task ran long enough, ensure it doesn't get
18337                  * re-elected due to buddy favours.
18338 @@ -3542,7 +3542,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
18339                 return;
18340  
18341         if (delta > ideal_runtime)
18342 -               resched_curr(rq_of(cfs_rq));
18343 +               resched_curr_lazy(rq_of(cfs_rq));
18344  }
18345  
18346  static void
18347 @@ -3684,7 +3684,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
18348          * validating it and just reschedule.
18349          */
18350         if (queued) {
18351 -               resched_curr(rq_of(cfs_rq));
18352 +               resched_curr_lazy(rq_of(cfs_rq));
18353                 return;
18354         }
18355         /*
18356 @@ -3866,7 +3866,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
18357          * hierarchy can be throttled
18358          */
18359         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
18360 -               resched_curr(rq_of(cfs_rq));
18361 +               resched_curr_lazy(rq_of(cfs_rq));
18362  }
18363  
18364  static __always_inline
18365 @@ -4494,7 +4494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
18366  
18367                 if (delta < 0) {
18368                         if (rq->curr == p)
18369 -                               resched_curr(rq);
18370 +                               resched_curr_lazy(rq);
18371                         return;
18372                 }
18373                 hrtick_start(rq, delta);
18374 @@ -5905,7 +5905,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
18375         return;
18376  
18377  preempt:
18378 -       resched_curr(rq);
18379 +       resched_curr_lazy(rq);
18380         /*
18381          * Only set the backward buddy when the current task is still
18382          * on the rq. This can happen when a wakeup gets interleaved
18383 @@ -8631,7 +8631,7 @@ static void task_fork_fair(struct task_struct *p)
18384                  * 'current' within the tree based on its new key value.
18385                  */
18386                 swap(curr->vruntime, se->vruntime);
18387 -               resched_curr(rq);
18388 +               resched_curr_lazy(rq);
18389         }
18390  
18391         se->vruntime -= cfs_rq->min_vruntime;
18392 @@ -8655,7 +8655,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
18393          */
18394         if (rq->curr == p) {
18395                 if (p->prio > oldprio)
18396 -                       resched_curr(rq);
18397 +                       resched_curr_lazy(rq);
18398         } else
18399                 check_preempt_curr(rq, p, 0);
18400  }
18401 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
18402 index 69631fa46c2f..6d28fcd08872 100644
18403 --- a/kernel/sched/features.h
18404 +++ b/kernel/sched/features.h
18405 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
18406   */
18407  SCHED_FEAT(NONTASK_CAPACITY, true)
18408  
18409 +#ifdef CONFIG_PREEMPT_RT_FULL
18410 +SCHED_FEAT(TTWU_QUEUE, false)
18411 +# ifdef CONFIG_PREEMPT_LAZY
18412 +SCHED_FEAT(PREEMPT_LAZY, true)
18413 +# endif
18414 +#else
18415 +
18416  /*
18417   * Queue remote wakeups on the target CPU and process them
18418   * using the scheduler IPI. Reduces rq->lock contention/bounces.
18419   */
18420  SCHED_FEAT(TTWU_QUEUE, true)
18421 +#endif
18422  
18423  #ifdef HAVE_RT_PUSH_IPI
18424  /*
18425 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
18426 index f139f22ce30d..b0691f4e7d49 100644
18427 --- a/kernel/sched/rt.c
18428 +++ b/kernel/sched/rt.c
18429 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
18430  
18431         hrtimer_init(&rt_b->rt_period_timer,
18432                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
18433 +       rt_b->rt_period_timer.irqsafe = 1;
18434         rt_b->rt_period_timer.function = sched_rt_period_timer;
18435  }
18436  
18437 @@ -101,6 +102,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
18438         rt_rq->push_cpu = nr_cpu_ids;
18439         raw_spin_lock_init(&rt_rq->push_lock);
18440         init_irq_work(&rt_rq->push_work, push_irq_work_func);
18441 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
18442  #endif
18443  #endif /* CONFIG_SMP */
18444         /* We start is dequeued state, because no RT tasks are queued */
18445 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
18446 index 055f935d4421..19324ac27026 100644
18447 --- a/kernel/sched/sched.h
18448 +++ b/kernel/sched/sched.h
18449 @@ -1163,6 +1163,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
18450  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
18451  #define WF_FORK                0x02            /* child wakeup after fork */
18452  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
18453 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
18454  
18455  /*
18456   * To aid in avoiding the subversion of "niceness" due to uneven distribution
18457 @@ -1346,6 +1347,15 @@ extern void init_sched_fair_class(void);
18458  extern void resched_curr(struct rq *rq);
18459  extern void resched_cpu(int cpu);
18460  
18461 +#ifdef CONFIG_PREEMPT_LAZY
18462 +extern void resched_curr_lazy(struct rq *rq);
18463 +#else
18464 +static inline void resched_curr_lazy(struct rq *rq)
18465 +{
18466 +       resched_curr(rq);
18467 +}
18468 +#endif
18469 +
18470  extern struct rt_bandwidth def_rt_bandwidth;
18471  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
18472  
18473 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
18474 index 82f0dff90030..ef027ff3250a 100644
18475 --- a/kernel/sched/swait.c
18476 +++ b/kernel/sched/swait.c
18477 @@ -1,5 +1,6 @@
18478  #include <linux/sched.h>
18479  #include <linux/swait.h>
18480 +#include <linux/suspend.h>
18481  
18482  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
18483                              struct lock_class_key *key)
18484 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
18485  }
18486  EXPORT_SYMBOL(swake_up_locked);
18487  
18488 +void swake_up_all_locked(struct swait_queue_head *q)
18489 +{
18490 +       struct swait_queue *curr;
18491 +       int wakes = 0;
18492 +
18493 +       while (!list_empty(&q->task_list)) {
18494 +
18495 +               curr = list_first_entry(&q->task_list, typeof(*curr),
18496 +                                       task_list);
18497 +               wake_up_process(curr->task);
18498 +               list_del_init(&curr->task_list);
18499 +               wakes++;
18500 +       }
18501 +       if (pm_in_action)
18502 +               return;
18503 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
18504 +}
18505 +EXPORT_SYMBOL(swake_up_all_locked);
18506 +
18507  void swake_up(struct swait_queue_head *q)
18508  {
18509         unsigned long flags;
18510 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
18511         if (!swait_active(q))
18512                 return;
18513  
18514 +       WARN_ON(irqs_disabled());
18515         raw_spin_lock_irq(&q->lock);
18516         list_splice_init(&q->task_list, &tmp);
18517         while (!list_empty(&tmp)) {
18518 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
18519 new file mode 100644
18520 index 000000000000..1950f40ca725
18521 --- /dev/null
18522 +++ b/kernel/sched/swork.c
18523 @@ -0,0 +1,173 @@
18524 +/*
18525 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
18526 + *
18527 + * Provides a framework for enqueuing callbacks from irq context
18528 + * PREEMPT_RT_FULL safe. The callbacks are executed in kthread context.
18529 + */
18530 +
18531 +#include <linux/swait.h>
18532 +#include <linux/swork.h>
18533 +#include <linux/kthread.h>
18534 +#include <linux/slab.h>
18535 +#include <linux/spinlock.h>
18536 +#include <linux/export.h>
18537 +
18538 +#define SWORK_EVENT_PENDING     (1 << 0)
18539 +
18540 +static DEFINE_MUTEX(worker_mutex);
18541 +static struct sworker *glob_worker;
18542 +
18543 +struct sworker {
18544 +       struct list_head events;
18545 +       struct swait_queue_head wq;
18546 +
18547 +       raw_spinlock_t lock;
18548 +
18549 +       struct task_struct *task;
18550 +       int refs;
18551 +};
18552 +
18553 +static bool swork_readable(struct sworker *worker)
18554 +{
18555 +       bool r;
18556 +
18557 +       if (kthread_should_stop())
18558 +               return true;
18559 +
18560 +       raw_spin_lock_irq(&worker->lock);
18561 +       r = !list_empty(&worker->events);
18562 +       raw_spin_unlock_irq(&worker->lock);
18563 +
18564 +       return r;
18565 +}
18566 +
18567 +static int swork_kthread(void *arg)
18568 +{
18569 +       struct sworker *worker = arg;
18570 +
18571 +       for (;;) {
18572 +               swait_event_interruptible(worker->wq,
18573 +                                       swork_readable(worker));
18574 +               if (kthread_should_stop())
18575 +                       break;
18576 +
18577 +               raw_spin_lock_irq(&worker->lock);
18578 +               while (!list_empty(&worker->events)) {
18579 +                       struct swork_event *sev;
18580 +
18581 +                       sev = list_first_entry(&worker->events,
18582 +                                       struct swork_event, item);
18583 +                       list_del(&sev->item);
18584 +                       raw_spin_unlock_irq(&worker->lock);
18585 +
18586 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
18587 +                                                        &sev->flags));
18588 +                       sev->func(sev);
18589 +                       raw_spin_lock_irq(&worker->lock);
18590 +               }
18591 +               raw_spin_unlock_irq(&worker->lock);
18592 +       }
18593 +       return 0;
18594 +}
18595 +
18596 +static struct sworker *swork_create(void)
18597 +{
18598 +       struct sworker *worker;
18599 +
18600 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
18601 +       if (!worker)
18602 +               return ERR_PTR(-ENOMEM);
18603 +
18604 +       INIT_LIST_HEAD(&worker->events);
18605 +       raw_spin_lock_init(&worker->lock);
18606 +       init_swait_queue_head(&worker->wq);
18607 +
18608 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
18609 +       if (IS_ERR(worker->task)) {
18610 +               kfree(worker);
18611 +               return ERR_PTR(-ENOMEM);
18612 +       }
18613 +
18614 +       return worker;
18615 +}
18616 +
18617 +static void swork_destroy(struct sworker *worker)
18618 +{
18619 +       kthread_stop(worker->task);
18620 +
18621 +       WARN_ON(!list_empty(&worker->events));
18622 +       kfree(worker);
18623 +}
18624 +
18625 +/**
18626 + * swork_queue - queue swork
18627 + *
18628 + * Returns %false if @sev was already on a queue, %true otherwise.
18629 + *
18630 + * The work is queued and processed on a random CPU
18631 + */
18632 +bool swork_queue(struct swork_event *sev)
18633 +{
18634 +       unsigned long flags;
18635 +
18636 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
18637 +               return false;
18638 +
18639 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
18640 +       list_add_tail(&sev->item, &glob_worker->events);
18641 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
18642 +
18643 +       swake_up(&glob_worker->wq);
18644 +       return true;
18645 +}
18646 +EXPORT_SYMBOL_GPL(swork_queue);
18647 +
18648 +/**
18649 + * swork_get - get an instance of the sworker
18650 + *
18651 + * Returns a negative error code if the initialization of the worker
18652 + * failed, %0 otherwise.
18653 + *
18654 + */
18655 +int swork_get(void)
18656 +{
18657 +       struct sworker *worker;
18658 +
18659 +       mutex_lock(&worker_mutex);
18660 +       if (!glob_worker) {
18661 +               worker = swork_create();
18662 +               if (IS_ERR(worker)) {
18663 +                       mutex_unlock(&worker_mutex);
18664 +                       return -ENOMEM;
18665 +               }
18666 +
18667 +               glob_worker = worker;
18668 +       }
18669 +
18670 +       glob_worker->refs++;
18671 +       mutex_unlock(&worker_mutex);
18672 +
18673 +       return 0;
18674 +}
18675 +EXPORT_SYMBOL_GPL(swork_get);
18676 +
18677 +/**
18678 + * swork_put - puts an instance of the sworker
18679 + *
18680 + * Will destroy the sworker thread. This function must not be called until all
18681 + * queued events have been completed.
18682 + */
18683 +void swork_put(void)
18684 +{
18685 +       mutex_lock(&worker_mutex);
18686 +
18687 +       glob_worker->refs--;
18688 +       if (glob_worker->refs > 0)
18689 +               goto out;
18690 +
18691 +       swork_destroy(glob_worker);
18692 +       glob_worker = NULL;
18693 +out:
18694 +       mutex_unlock(&worker_mutex);
18695 +}
18696 +EXPORT_SYMBOL_GPL(swork_put);
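
For illustration only (not part of the patch): how a client might use the swork API added above. struct swork_event and the INIT_SWORK() initializer are assumed to come from the include/linux/swork.h hunk elsewhere in this patch; the names below (my_event, my_event_fn, ...) are made up for the sketch.

#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/swork.h>

static struct swork_event my_event;

/* runs later in the "kswork" kthread, fully preemptible even on RT */
static void my_event_fn(struct swork_event *sev)
{
        pr_info("deferred work ran\n");
}

/* registration via request_irq() omitted for brevity */
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
        /* safe from hard irq context; a no-op if the event is already queued */
        swork_queue(&my_event);
        return IRQ_HANDLED;
}

static int __init my_init(void)
{
        int err = swork_get();          /* create or take a ref on the worker */

        if (err)
                return err;
        INIT_SWORK(&my_event, my_event_fn);
        return 0;
}

static void __exit my_exit(void)
{
        swork_put();                    /* worker is destroyed on the last put */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

swork_get()/swork_put() reference-count a single global "kswork" thread, so all users share one worker and the last put tears it down.
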
18697 diff --git a/kernel/signal.c b/kernel/signal.c
18698 index 0b1415720a15..c884647951f7 100644
18699 --- a/kernel/signal.c
18700 +++ b/kernel/signal.c
18701 @@ -14,6 +14,7 @@
18702  #include <linux/export.h>
18703  #include <linux/init.h>
18704  #include <linux/sched.h>
18705 +#include <linux/sched/rt.h>
18706  #include <linux/fs.h>
18707  #include <linux/tty.h>
18708  #include <linux/binfmts.h>
18709 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
18710         return false;
18711  }
18712  
18713 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
18714 +{
18715 +       struct sigqueue *q = t->sigqueue_cache;
18716 +
18717 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
18718 +               return NULL;
18719 +       return q;
18720 +}
18721 +
18722 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
18723 +{
18724 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
18725 +               return 0;
18726 +       return 1;
18727 +}
18728 +
18729  /*
18730   * allocate a new signal queue record
18731   * - this may be called without locks if and only if t == current, otherwise an
18732   *   appropriate lock must be held to stop the target task from exiting
18733   */
18734  static struct sigqueue *
18735 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
18736 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
18737 +                   int override_rlimit, int fromslab)
18738  {
18739         struct sigqueue *q = NULL;
18740         struct user_struct *user;
18741 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
18742         if (override_rlimit ||
18743             atomic_read(&user->sigpending) <=
18744                         task_rlimit(t, RLIMIT_SIGPENDING)) {
18745 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
18746 +               if (!fromslab)
18747 +                       q = get_task_cache(t);
18748 +               if (!q)
18749 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
18750         } else {
18751                 print_dropped_signal(sig);
18752         }
18753 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
18754         return q;
18755  }
18756  
18757 +static struct sigqueue *
18758 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
18759 +                int override_rlimit)
18760 +{
18761 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
18762 +}
18763 +
18764  static void __sigqueue_free(struct sigqueue *q)
18765  {
18766         if (q->flags & SIGQUEUE_PREALLOC)
18767 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
18768         kmem_cache_free(sigqueue_cachep, q);
18769  }
18770  
18771 +static void sigqueue_free_current(struct sigqueue *q)
18772 +{
18773 +       struct user_struct *up;
18774 +
18775 +       if (q->flags & SIGQUEUE_PREALLOC)
18776 +               return;
18777 +
18778 +       up = q->user;
18779 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
18780 +               atomic_dec(&up->sigpending);
18781 +               free_uid(up);
18782 +       } else
18783 +                 __sigqueue_free(q);
18784 +}
18785 +
18786  void flush_sigqueue(struct sigpending *queue)
18787  {
18788         struct sigqueue *q;
18789 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
18790  }
18791  
18792  /*
18793 + * Called from __exit_signal. Flush tsk->pending and
18794 + * tsk->sigqueue_cache
18795 + */
18796 +void flush_task_sigqueue(struct task_struct *tsk)
18797 +{
18798 +       struct sigqueue *q;
18799 +
18800 +       flush_sigqueue(&tsk->pending);
18801 +
18802 +       q = get_task_cache(tsk);
18803 +       if (q)
18804 +               kmem_cache_free(sigqueue_cachep, q);
18805 +}
18806 +
18807 +/*
18808   * Flush all pending signals for this kthread.
18809   */
18810  void flush_signals(struct task_struct *t)
18811 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
18812  still_pending:
18813                 list_del_init(&first->list);
18814                 copy_siginfo(info, &first->info);
18815 -               __sigqueue_free(first);
18816 +               sigqueue_free_current(first);
18817         } else {
18818                 /*
18819                  * Ok, it wasn't in the queue.  This must be
18820 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
18821  {
18822         int signr;
18823  
18824 +       WARN_ON_ONCE(tsk != current);
18825 +
18826         /* We only dequeue private signals from ourselves, we don't let
18827          * signalfd steal them
18828          */
18829 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
18830   * We don't want to have recursive SIGSEGV's etc, for example,
18831   * that is why we also clear SIGNAL_UNKILLABLE.
18832   */
18833 -int
18834 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18835 +static int
18836 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18837  {
18838         unsigned long int flags;
18839         int ret, blocked, ignored;
18840 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18841         return ret;
18842  }
18843  
18844 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
18845 +{
18846 +/*
18847 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
18848 + * since it can not enable preemption, and the signal code's spin_locks
18849 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
18850 + * send the signal on exit of the trap.
18851 + */
18852 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
18853 +       if (in_atomic()) {
18854 +               if (WARN_ON_ONCE(t != current))
18855 +                       return 0;
18856 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
18857 +                       return 0;
18858 +
18859 +               if (is_si_special(info)) {
18860 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
18861 +                       t->forced_info.si_signo = sig;
18862 +                       t->forced_info.si_errno = 0;
18863 +                       t->forced_info.si_code = SI_KERNEL;
18864 +                       t->forced_info.si_pid = 0;
18865 +                       t->forced_info.si_uid = 0;
18866 +               } else {
18867 +                       t->forced_info = *info;
18868 +               }
18869 +
18870 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
18871 +               return 0;
18872 +       }
18873 +#endif
18874 +       return do_force_sig_info(sig, info, t);
18875 +}
18876 +
18877  /*
18878   * Nuke all other threads in the group.
18879   */
18880 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
18881                  * Disable interrupts early to avoid deadlocks.
18882                  * See rcu_read_unlock() comment header for details.
18883                  */
18884 -               local_irq_save(*flags);
18885 +               local_irq_save_nort(*flags);
18886                 rcu_read_lock();
18887                 sighand = rcu_dereference(tsk->sighand);
18888                 if (unlikely(sighand == NULL)) {
18889                         rcu_read_unlock();
18890 -                       local_irq_restore(*flags);
18891 +                       local_irq_restore_nort(*flags);
18892                         break;
18893                 }
18894                 /*
18895 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
18896                 }
18897                 spin_unlock(&sighand->siglock);
18898                 rcu_read_unlock();
18899 -               local_irq_restore(*flags);
18900 +               local_irq_restore_nort(*flags);
18901         }
18902  
18903         return sighand;
18904 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
18905   */
18906  struct sigqueue *sigqueue_alloc(void)
18907  {
18908 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
18909 +       /* Preallocated sigqueue objects always from the slabcache ! */
18910 +       /* Preallocated sigqueue objects always come from the slabcache! */
18911  
18912         if (q)
18913                 q->flags |= SIGQUEUE_PREALLOC;
18914 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
18915                 if (gstop_done && ptrace_reparented(current))
18916                         do_notify_parent_cldstop(current, false, why);
18917  
18918 -               /*
18919 -                * Don't want to allow preemption here, because
18920 -                * sys_ptrace() needs this task to be inactive.
18921 -                *
18922 -                * XXX: implement read_unlock_no_resched().
18923 -                */
18924 -               preempt_disable();
18925                 read_unlock(&tasklist_lock);
18926 -               preempt_enable_no_resched();
18927                 freezable_schedule();
18928         } else {
18929                 /*
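
The get_task_cache()/put_task_cache() helpers above form a lock-free single-slot cache: cmpxchg() either hands the cached sigqueue back or parks a freed one, so the signal path of an RT-priority task can usually avoid the slab allocator. The same pattern in miniature, as a stand-alone user-space sketch (illustration only; GCC's __sync_val_compare_and_swap() stands in for the kernel's cmpxchg()):

#include <assert.h>
#include <stddef.h>

struct obj { int payload; };

static struct obj *cache_slot;          /* one-entry cache, NULL when empty */

/* take the cached object, or NULL if the slot is empty or we raced */
static struct obj *cache_get(void)
{
        struct obj *o = cache_slot;

        if (__sync_val_compare_and_swap(&cache_slot, o, NULL) != o)
                return NULL;
        return o;
}

/* park an object in the slot; returns 0 on success, 1 if it is occupied */
static int cache_put(struct obj *o)
{
        return __sync_val_compare_and_swap(&cache_slot, NULL, o) == NULL ? 0 : 1;
}

int main(void)
{
        static struct obj a;

        assert(cache_get() == NULL);    /* empty at first */
        assert(cache_put(&a) == 0);     /* parked */
        assert(cache_put(&a) == 1);     /* slot already taken */
        assert(cache_get() == &a);      /* handed back exactly once */
        assert(cache_get() == NULL);
        return 0;
}
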
18930 diff --git a/kernel/softirq.c b/kernel/softirq.c
18931 index 744fa611cae0..819bd7cf5ad0 100644
18932 --- a/kernel/softirq.c
18933 +++ b/kernel/softirq.c
18934 @@ -21,10 +21,12 @@
18935  #include <linux/freezer.h>
18936  #include <linux/kthread.h>
18937  #include <linux/rcupdate.h>
18938 +#include <linux/delay.h>
18939  #include <linux/ftrace.h>
18940  #include <linux/smp.h>
18941  #include <linux/smpboot.h>
18942  #include <linux/tick.h>
18943 +#include <linux/locallock.h>
18944  #include <linux/irq.h>
18945  
18946  #define CREATE_TRACE_POINTS
18947 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
18948  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
18949  
18950  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
18951 +#ifdef CONFIG_PREEMPT_RT_FULL
18952 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
18953 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
18954 +#endif
18955  
18956  const char * const softirq_to_name[NR_SOFTIRQS] = {
18957         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
18958         "TASKLET", "SCHED", "HRTIMER", "RCU"
18959  };
18960  
18961 +#ifdef CONFIG_NO_HZ_COMMON
18962 +# ifdef CONFIG_PREEMPT_RT_FULL
18963 +
18964 +struct softirq_runner {
18965 +       struct task_struct *runner[NR_SOFTIRQS];
18966 +};
18967 +
18968 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
18969 +
18970 +static inline void softirq_set_runner(unsigned int sirq)
18971 +{
18972 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18973 +
18974 +       sr->runner[sirq] = current;
18975 +}
18976 +
18977 +static inline void softirq_clr_runner(unsigned int sirq)
18978 +{
18979 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18980 +
18981 +       sr->runner[sirq] = NULL;
18982 +}
18983 +
18984 +/*
18985 + * On preempt-rt a softirq running context might be blocked on a
18986 + * lock. There might be no other runnable task on this CPU because the
18987 + * lock owner runs on some other CPU. So we have to go into idle with
18988 + * the pending bit set. Therefore we need to check this, otherwise we
18989 + * warn about false positives which confuses users and defeats the
18990 + * whole purpose of this test.
18991 + *
18992 + * This code is called with interrupts disabled.
18993 + */
18994 +void softirq_check_pending_idle(void)
18995 +{
18996 +       static int rate_limit;
18997 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18998 +       u32 warnpending;
18999 +       int i;
19000 +
19001 +       if (rate_limit >= 10)
19002 +               return;
19003 +
19004 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
19005 +       for (i = 0; i < NR_SOFTIRQS; i++) {
19006 +               struct task_struct *tsk = sr->runner[i];
19007 +
19008 +               /*
19009 +                * The wakeup code in rtmutex.c wakes up the task
19010 +                * _before_ it sets pi_blocked_on to NULL under
19011 +                * tsk->pi_lock. So we need to check for both: state
19012 +                * and pi_blocked_on.
19013 +                */
19014 +               if (tsk) {
19015 +                       raw_spin_lock(&tsk->pi_lock);
19016 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
19017 +                               /* Clear all bits pending in that task */
19018 +                               warnpending &= ~(tsk->softirqs_raised);
19019 +                               warnpending &= ~(1 << i);
19020 +                       }
19021 +                       raw_spin_unlock(&tsk->pi_lock);
19022 +               }
19023 +       }
19024 +
19025 +       if (warnpending) {
19026 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
19027 +                      warnpending);
19028 +               rate_limit++;
19029 +       }
19030 +}
19031 +# else
19032 +/*
19033 + * On !PREEMPT_RT we just printk rate limited:
19034 + */
19035 +void softirq_check_pending_idle(void)
19036 +{
19037 +       static int rate_limit;
19038 +
19039 +       if (rate_limit < 10 &&
19040 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
19041 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
19042 +                      local_softirq_pending());
19043 +               rate_limit++;
19044 +       }
19045 +}
19046 +# endif
19047 +
19048 +#else /* !CONFIG_NO_HZ_COMMON */
19049 +static inline void softirq_set_runner(unsigned int sirq) { }
19050 +static inline void softirq_clr_runner(unsigned int sirq) { }
19051 +#endif
19052 +
19053  /*
19054   * we cannot loop indefinitely here to avoid userspace starvation,
19055   * but we also don't want to introduce a worst case 1/HZ latency
19056 @@ -77,6 +175,38 @@ static void wakeup_softirqd(void)
19057                 wake_up_process(tsk);
19058  }
19059  
19060 +#ifdef CONFIG_PREEMPT_RT_FULL
19061 +static void wakeup_timer_softirqd(void)
19062 +{
19063 +       /* Interrupts are disabled: no need to stop preemption */
19064 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
19065 +
19066 +       if (tsk && tsk->state != TASK_RUNNING)
19067 +               wake_up_process(tsk);
19068 +}
19069 +#endif
19070 +
19071 +static void handle_softirq(unsigned int vec_nr)
19072 +{
19073 +       struct softirq_action *h = softirq_vec + vec_nr;
19074 +       int prev_count;
19075 +
19076 +       prev_count = preempt_count();
19077 +
19078 +       kstat_incr_softirqs_this_cpu(vec_nr);
19079 +
19080 +       trace_softirq_entry(vec_nr);
19081 +       h->action(h);
19082 +       trace_softirq_exit(vec_nr);
19083 +       if (unlikely(prev_count != preempt_count())) {
19084 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
19085 +                      vec_nr, softirq_to_name[vec_nr], h->action,
19086 +                      prev_count, preempt_count());
19087 +               preempt_count_set(prev_count);
19088 +       }
19089 +}
19090 +
19091 +#ifndef CONFIG_PREEMPT_RT_FULL
19092  /*
19093   * If ksoftirqd is scheduled, we do not want to process pending softirqs
19094   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
19095 @@ -88,6 +218,47 @@ static bool ksoftirqd_running(void)
19096         return tsk && (tsk->state == TASK_RUNNING);
19097  }
19098  
19099 +static inline int ksoftirqd_softirq_pending(void)
19100 +{
19101 +       return local_softirq_pending();
19102 +}
19103 +
19104 +static void handle_pending_softirqs(u32 pending)
19105 +{
19106 +       struct softirq_action *h = softirq_vec;
19107 +       int softirq_bit;
19108 +
19109 +       local_irq_enable();
19110 +
19111 +       h = softirq_vec;
19112 +
19113 +       while ((softirq_bit = ffs(pending))) {
19114 +               unsigned int vec_nr;
19115 +
19116 +               h += softirq_bit - 1;
19117 +               vec_nr = h - softirq_vec;
19118 +               handle_softirq(vec_nr);
19119 +
19120 +               h++;
19121 +               pending >>= softirq_bit;
19122 +       }
19123 +
19124 +       rcu_bh_qs();
19125 +       local_irq_disable();
19126 +}
19127 +
19128 +static void run_ksoftirqd(unsigned int cpu)
19129 +{
19130 +       local_irq_disable();
19131 +       if (ksoftirqd_softirq_pending()) {
19132 +               __do_softirq();
19133 +               local_irq_enable();
19134 +               cond_resched_rcu_qs();
19135 +               return;
19136 +       }
19137 +       local_irq_enable();
19138 +}
19139 +
19140  /*
19141   * preempt_count and SOFTIRQ_OFFSET usage:
19142   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
19143 @@ -243,10 +414,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
19144         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
19145         unsigned long old_flags = current->flags;
19146         int max_restart = MAX_SOFTIRQ_RESTART;
19147 -       struct softirq_action *h;
19148         bool in_hardirq;
19149         __u32 pending;
19150 -       int softirq_bit;
19151  
19152         /*
19153          * Mask out PF_MEMALLOC s current task context is borrowed for the
19154 @@ -265,36 +434,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
19155         /* Reset the pending bitmask before enabling irqs */
19156         set_softirq_pending(0);
19157  
19158 -       local_irq_enable();
19159 -
19160 -       h = softirq_vec;
19161 -
19162 -       while ((softirq_bit = ffs(pending))) {
19163 -               unsigned int vec_nr;
19164 -               int prev_count;
19165 -
19166 -               h += softirq_bit - 1;
19167 -
19168 -               vec_nr = h - softirq_vec;
19169 -               prev_count = preempt_count();
19170 -
19171 -               kstat_incr_softirqs_this_cpu(vec_nr);
19172 -
19173 -               trace_softirq_entry(vec_nr);
19174 -               h->action(h);
19175 -               trace_softirq_exit(vec_nr);
19176 -               if (unlikely(prev_count != preempt_count())) {
19177 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
19178 -                              vec_nr, softirq_to_name[vec_nr], h->action,
19179 -                              prev_count, preempt_count());
19180 -                       preempt_count_set(prev_count);
19181 -               }
19182 -               h++;
19183 -               pending >>= softirq_bit;
19184 -       }
19185 -
19186 -       rcu_bh_qs();
19187 -       local_irq_disable();
19188 +       handle_pending_softirqs(pending);
19189  
19190         pending = local_softirq_pending();
19191         if (pending) {
19192 @@ -331,6 +471,309 @@ asmlinkage __visible void do_softirq(void)
19193  }
19194  
19195  /*
19196 + * This function must run with irqs disabled!
19197 + */
19198 +void raise_softirq_irqoff(unsigned int nr)
19199 +{
19200 +       __raise_softirq_irqoff(nr);
19201 +
19202 +       /*
19203 +        * If we're in an interrupt or softirq, we're done
19204 +        * (this also catches softirq-disabled code). We will
19205 +        * actually run the softirq once we return from
19206 +        * the irq or softirq.
19207 +        *
19208 +        * Otherwise we wake up ksoftirqd to make sure we
19209 +        * schedule the softirq soon.
19210 +        */
19211 +       if (!in_interrupt())
19212 +               wakeup_softirqd();
19213 +}
19214 +
19215 +void __raise_softirq_irqoff(unsigned int nr)
19216 +{
19217 +       trace_softirq_raise(nr);
19218 +       or_softirq_pending(1UL << nr);
19219 +}
19220 +
19221 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
19222 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
19223 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
19224 +
19225 +#else /* !PREEMPT_RT_FULL */
19226 +
19227 +/*
19228 + * On RT we serialize softirq execution with a cpu local lock per softirq
19229 + */
19230 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
19231 +
19232 +void __init softirq_early_init(void)
19233 +{
19234 +       int i;
19235 +
19236 +       for (i = 0; i < NR_SOFTIRQS; i++)
19237 +               local_irq_lock_init(local_softirq_locks[i]);
19238 +}
19239 +
19240 +static void lock_softirq(int which)
19241 +{
19242 +       local_lock(local_softirq_locks[which]);
19243 +}
19244 +
19245 +static void unlock_softirq(int which)
19246 +{
19247 +       local_unlock(local_softirq_locks[which]);
19248 +}
19249 +
19250 +static void do_single_softirq(int which)
19251 +{
19252 +       unsigned long old_flags = current->flags;
19253 +
19254 +       current->flags &= ~PF_MEMALLOC;
19255 +       vtime_account_irq_enter(current);
19256 +       current->flags |= PF_IN_SOFTIRQ;
19257 +       lockdep_softirq_enter();
19258 +       local_irq_enable();
19259 +       handle_softirq(which);
19260 +       local_irq_disable();
19261 +       lockdep_softirq_exit();
19262 +       current->flags &= ~PF_IN_SOFTIRQ;
19263 +       vtime_account_irq_enter(current);
19264 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
19265 +}
19266 +
19267 +/*
19268 + * Called with interrupts disabled. Process softirqs which were raised
19269 + * in current context (or on behalf of ksoftirqd).
19270 + */
19271 +static void do_current_softirqs(void)
19272 +{
19273 +       while (current->softirqs_raised) {
19274 +               int i = __ffs(current->softirqs_raised);
19275 +               unsigned int pending, mask = (1U << i);
19276 +
19277 +               current->softirqs_raised &= ~mask;
19278 +               local_irq_enable();
19279 +
19280 +               /*
19281 +                * If the lock is contended, we boost the owner to
19282 +                * process the softirq or leave the critical section
19283 +                * now.
19284 +                */
19285 +               lock_softirq(i);
19286 +               local_irq_disable();
19287 +               softirq_set_runner(i);
19288 +               /*
19289 +                * Check with the local_softirq_pending() bits,
19290 +                * whether we need to process this still or if someone
19291 +                * else took care of it.
19292 +                */
19293 +               pending = local_softirq_pending();
19294 +               if (pending & mask) {
19295 +                       set_softirq_pending(pending & ~mask);
19296 +                       do_single_softirq(i);
19297 +               }
19298 +               softirq_clr_runner(i);
19299 +               WARN_ON(current->softirq_nestcnt != 1);
19300 +               local_irq_enable();
19301 +               unlock_softirq(i);
19302 +               local_irq_disable();
19303 +       }
19304 +}
19305 +
19306 +void __local_bh_disable(void)
19307 +{
19308 +       if (++current->softirq_nestcnt == 1)
19309 +               migrate_disable();
19310 +}
19311 +EXPORT_SYMBOL(__local_bh_disable);
19312 +
19313 +void __local_bh_enable(void)
19314 +{
19315 +       if (WARN_ON(current->softirq_nestcnt == 0))
19316 +               return;
19317 +
19318 +       local_irq_disable();
19319 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
19320 +               do_current_softirqs();
19321 +       local_irq_enable();
19322 +
19323 +       if (--current->softirq_nestcnt == 0)
19324 +               migrate_enable();
19325 +}
19326 +EXPORT_SYMBOL(__local_bh_enable);
19327 +
19328 +void _local_bh_enable(void)
19329 +{
19330 +       if (WARN_ON(current->softirq_nestcnt == 0))
19331 +               return;
19332 +       if (--current->softirq_nestcnt == 0)
19333 +               migrate_enable();
19334 +}
19335 +EXPORT_SYMBOL(_local_bh_enable);
19336 +
19337 +int in_serving_softirq(void)
19338 +{
19339 +       return current->flags & PF_IN_SOFTIRQ;
19340 +}
19341 +EXPORT_SYMBOL(in_serving_softirq);
19342 +
19343 +/* Called with preemption disabled */
19344 +static void run_ksoftirqd(unsigned int cpu)
19345 +{
19346 +       local_irq_disable();
19347 +       current->softirq_nestcnt++;
19348 +
19349 +       do_current_softirqs();
19350 +       current->softirq_nestcnt--;
19351 +       local_irq_enable();
19352 +       cond_resched_rcu_qs();
19353 +}
19354 +
19355 +/*
19356 + * Called from netif_rx_ni(). Preemption enabled, but migration
19357 + * disabled. So the cpu can't go away under us.
19358 + */
19359 +void thread_do_softirq(void)
19360 +{
19361 +       if (!in_serving_softirq() && current->softirqs_raised) {
19362 +               current->softirq_nestcnt++;
19363 +               do_current_softirqs();
19364 +               current->softirq_nestcnt--;
19365 +       }
19366 +}
19367 +
19368 +static void do_raise_softirq_irqoff(unsigned int nr)
19369 +{
19370 +       unsigned int mask;
19371 +
19372 +       mask = 1UL << nr;
19373 +
19374 +       trace_softirq_raise(nr);
19375 +       or_softirq_pending(mask);
19376 +
19377 +       /*
19378 +        * If we are not in a hard interrupt and inside a bh disabled
19379 +        * region, we simply raise the flag on current. local_bh_enable()
19380 +        * will make sure that the softirq is executed. Otherwise we
19381 +        * delegate it to ksoftirqd.
19382 +        */
19383 +       if (!in_irq() && current->softirq_nestcnt)
19384 +               current->softirqs_raised |= mask;
19385 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
19386 +               return;
19387 +
19388 +       if (mask & TIMER_SOFTIRQS)
19389 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
19390 +       else
19391 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
19392 +}
19393 +
19394 +static void wakeup_proper_softirq(unsigned int nr)
19395 +{
19396 +       if ((1UL << nr) & TIMER_SOFTIRQS)
19397 +               wakeup_timer_softirqd();
19398 +       else
19399 +               wakeup_softirqd();
19400 +}
19401 +
19402 +void __raise_softirq_irqoff(unsigned int nr)
19403 +{
19404 +       do_raise_softirq_irqoff(nr);
19405 +       if (!in_irq() && !current->softirq_nestcnt)
19406 +               wakeup_proper_softirq(nr);
19407 +}
19408 +
19409 +/*
19410 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
19411 + */
19412 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
19413 +{
19414 +       unsigned int mask;
19415 +
19416 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
19417 +                        !__this_cpu_read(ktimer_softirqd)))
19418 +               return;
19419 +       mask = 1UL << nr;
19420 +
19421 +       trace_softirq_raise(nr);
19422 +       or_softirq_pending(mask);
19423 +       if (mask & TIMER_SOFTIRQS)
19424 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
19425 +       else
19426 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
19427 +       wakeup_proper_softirq(nr);
19428 +}
19429 +
19430 +/*
19431 + * This function must run with irqs disabled!
19432 + */
19433 +void raise_softirq_irqoff(unsigned int nr)
19434 +{
19435 +       do_raise_softirq_irqoff(nr);
19436 +
19437 +       /*
19438 +        * If we're in a hard interrupt we let the irq return code deal
19439 +        * with the wakeup of ksoftirqd.
19440 +        */
19441 +       if (in_irq())
19442 +               return;
19443 +       /*
19444 +        * If we are in thread context but outside of a bh disabled
19445 +        * region, we need to wake ksoftirqd as well.
19446 +        *
19447 +        * CHECKME: Some of the places which do that could be wrapped
19448 +        * into local_bh_disable/enable pairs. Though it's unclear
19449 +        * whether this is worth the effort. To find those places just
19450 +        * raise a WARN() if the condition is met.
19451 +        */
19452 +       if (!current->softirq_nestcnt)
19453 +               wakeup_proper_softirq(nr);
19454 +}
19455 +
19456 +static inline int ksoftirqd_softirq_pending(void)
19457 +{
19458 +       return current->softirqs_raised;
19459 +}
19460 +
19461 +static inline void local_bh_disable_nort(void) { }
19462 +static inline void _local_bh_enable_nort(void) { }
19463 +
19464 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
19465 +{
19466 +       /* Take over all but timer pending softirqs when starting */
19467 +       local_irq_disable();
19468 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
19469 +       local_irq_enable();
19470 +}
19471 +
19472 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
19473 +{
19474 +       struct sched_param param = { .sched_priority = 1 };
19475 +
19476 +       sched_setscheduler(current, SCHED_FIFO, &param);
19477 +
19478 +       /* Take over timer pending softirqs when starting */
19479 +       local_irq_disable();
19480 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
19481 +       local_irq_enable();
19482 +}
19483 +
19484 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
19485 +                                                   bool online)
19486 +{
19487 +       struct sched_param param = { .sched_priority = 0 };
19488 +
19489 +       sched_setscheduler(current, SCHED_NORMAL, &param);
19490 +}
19491 +
19492 +static int ktimer_softirqd_should_run(unsigned int cpu)
19493 +{
19494 +       return current->softirqs_raised;
19495 +}
19496 +
19497 +#endif /* PREEMPT_RT_FULL */
19498 +/*
19499   * Enter an interrupt context.
19500   */
19501  void irq_enter(void)
19502 @@ -341,9 +784,9 @@ void irq_enter(void)
19503                  * Prevent raise_softirq from needlessly waking up ksoftirqd
19504                  * here, as softirq will be serviced on return from interrupt.
19505                  */
19506 -               local_bh_disable();
19507 +               local_bh_disable_nort();
19508                 tick_irq_enter();
19509 -               _local_bh_enable();
19510 +               _local_bh_enable_nort();
19511         }
19512  
19513         __irq_enter();
19514 @@ -351,6 +794,7 @@ void irq_enter(void)
19515  
19516  static inline void invoke_softirq(void)
19517  {
19518 +#ifndef CONFIG_PREEMPT_RT_FULL
19519         if (ksoftirqd_running())
19520                 return;
19521  
19522 @@ -373,6 +817,18 @@ static inline void invoke_softirq(void)
19523         } else {
19524                 wakeup_softirqd();
19525         }
19526 +#else /* PREEMPT_RT_FULL */
19527 +       unsigned long flags;
19528 +
19529 +       local_irq_save(flags);
19530 +       if (__this_cpu_read(ksoftirqd) &&
19531 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
19532 +               wakeup_softirqd();
19533 +       if (__this_cpu_read(ktimer_softirqd) &&
19534 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
19535 +               wakeup_timer_softirqd();
19536 +       local_irq_restore(flags);
19537 +#endif
19538  }
19539  
19540  static inline void tick_irq_exit(void)
19541 @@ -409,26 +865,6 @@ void irq_exit(void)
19542         trace_hardirq_exit(); /* must be last! */
19543  }
19544  
19545 -/*
19546 - * This function must run with irqs disabled!
19547 - */
19548 -inline void raise_softirq_irqoff(unsigned int nr)
19549 -{
19550 -       __raise_softirq_irqoff(nr);
19551 -
19552 -       /*
19553 -        * If we're in an interrupt or softirq, we're done
19554 -        * (this also catches softirq-disabled code). We will
19555 -        * actually run the softirq once we return from
19556 -        * the irq or softirq.
19557 -        *
19558 -        * Otherwise we wake up ksoftirqd to make sure we
19559 -        * schedule the softirq soon.
19560 -        */
19561 -       if (!in_interrupt())
19562 -               wakeup_softirqd();
19563 -}
19564 -
19565  void raise_softirq(unsigned int nr)
19566  {
19567         unsigned long flags;
19568 @@ -438,12 +874,6 @@ void raise_softirq(unsigned int nr)
19569         local_irq_restore(flags);
19570  }
19571  
19572 -void __raise_softirq_irqoff(unsigned int nr)
19573 -{
19574 -       trace_softirq_raise(nr);
19575 -       or_softirq_pending(1UL << nr);
19576 -}
19577 -
19578  void open_softirq(int nr, void (*action)(struct softirq_action *))
19579  {
19580         softirq_vec[nr].action = action;
19581 @@ -460,15 +890,45 @@ struct tasklet_head {
19582  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
19583  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
19584  
19585 +static inline void
19586 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
19587 +{
19588 +       if (tasklet_trylock(t)) {
19589 +again:
19590 +               /* We may have been preempted before tasklet_trylock
19591 +                * and __tasklet_action may have already run.
19592 +                * So double check the sched bit while the takslet
19593 +                * So double check the sched bit while the tasklet
19594 +                */
19595 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
19596 +                       t->next = NULL;
19597 +                       *head->tail = t;
19598 +                       head->tail = &(t->next);
19599 +                       raise_softirq_irqoff(nr);
19600 +                       tasklet_unlock(t);
19601 +               } else {
19602 +                       /* This is subtle. If we hit the corner case above
19603 +                       /* This is subtle. If we hit the corner case above,
19604 +                        * it is possible that we get preempted right here,
19605 +                        * tasklet_schedule(), then this function, and
19606 +                        * failed on the trylock. Thus we must be sure
19607 +                        * before releasing the tasklet lock, that the
19608 +                        * SCHED_BIT is clear. Otherwise the tasklet
19609 +                        * may get its SCHED_BIT set, but not added to the
19610 +                        * list
19611 +                        * list.
19612 +                       if (!tasklet_tryunlock(t))
19613 +                               goto again;
19614 +               }
19615 +       }
19616 +}
19617 +
19618  void __tasklet_schedule(struct tasklet_struct *t)
19619  {
19620         unsigned long flags;
19621  
19622         local_irq_save(flags);
19623 -       t->next = NULL;
19624 -       *__this_cpu_read(tasklet_vec.tail) = t;
19625 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
19626 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
19627 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
19628         local_irq_restore(flags);
19629  }
19630  EXPORT_SYMBOL(__tasklet_schedule);
19631 @@ -478,10 +938,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
19632         unsigned long flags;
19633  
19634         local_irq_save(flags);
19635 -       t->next = NULL;
19636 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
19637 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
19638 -       raise_softirq_irqoff(HI_SOFTIRQ);
19639 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
19640         local_irq_restore(flags);
19641  }
19642  EXPORT_SYMBOL(__tasklet_hi_schedule);
19643 @@ -490,82 +947,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
19644  {
19645         BUG_ON(!irqs_disabled());
19646  
19647 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
19648 -       __this_cpu_write(tasklet_hi_vec.head, t);
19649 -       __raise_softirq_irqoff(HI_SOFTIRQ);
19650 +       __tasklet_hi_schedule(t);
19651  }
19652  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
19653  
19654 -static __latent_entropy void tasklet_action(struct softirq_action *a)
19655 +void tasklet_enable(struct tasklet_struct *t)
19656  {
19657 -       struct tasklet_struct *list;
19658 +       if (!atomic_dec_and_test(&t->count))
19659 +               return;
19660 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
19661 +               tasklet_schedule(t);
19662 +}
19663 +EXPORT_SYMBOL(tasklet_enable);
19664  
19665 -       local_irq_disable();
19666 -       list = __this_cpu_read(tasklet_vec.head);
19667 -       __this_cpu_write(tasklet_vec.head, NULL);
19668 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
19669 -       local_irq_enable();
19670 +static void __tasklet_action(struct softirq_action *a,
19671 +                            struct tasklet_struct *list)
19672 +{
19673 +       int loops = 1000000;
19674  
19675         while (list) {
19676                 struct tasklet_struct *t = list;
19677  
19678                 list = list->next;
19679  
19680 -               if (tasklet_trylock(t)) {
19681 -                       if (!atomic_read(&t->count)) {
19682 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
19683 -                                                       &t->state))
19684 -                                       BUG();
19685 -                               t->func(t->data);
19686 -                               tasklet_unlock(t);
19687 -                               continue;
19688 -                       }
19689 -                       tasklet_unlock(t);
19690 +               /*
19691 +                * Should always succeed - after a tasklet got on the
19692 +                * list (after getting the SCHED bit set from 0 to 1),
19693 +                * nothing but the tasklet softirq it got queued to can
19694 +                * lock it:
19695 +                */
19696 +               if (!tasklet_trylock(t)) {
19697 +                       WARN_ON(1);
19698 +                       continue;
19699                 }
19700  
19701 -               local_irq_disable();
19702                 t->next = NULL;
19703 -               *__this_cpu_read(tasklet_vec.tail) = t;
19704 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
19705 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
19706 -               local_irq_enable();
19707 +
19708 +               /*
19709 +                * If we cannot handle the tasklet because it's disabled,
19710 +                * mark it as pending. tasklet_enable() will later
19711 +                * re-schedule the tasklet.
19712 +                */
19713 +               if (unlikely(atomic_read(&t->count))) {
19714 +out_disabled:
19715 +                       /* implicit unlock: */
19716 +                       wmb();
19717 +                       t->state = TASKLET_STATEF_PENDING;
19718 +                       continue;
19719 +               }
19720 +
19721 +               /*
19722 +                * From this point on the tasklet might be rescheduled
19723 +                * on another CPU, but it can only be added to another
19724 +                * CPU's tasklet list if we unlock the tasklet (which we
19725 +                * don't do yet).
19726 +                */
19727 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
19728 +                       WARN_ON(1);
19729 +
19730 +again:
19731 +               t->func(t->data);
19732 +
19733 +               /*
19734 +                * Try to unlock the tasklet. We must use cmpxchg, because
19735 +                * another CPU might have scheduled or disabled the tasklet.
19736 +                * We only allow the STATE_RUN -> 0 transition here.
19737 +                */
19738 +               while (!tasklet_tryunlock(t)) {
19739 +                       /*
19740 +                        * If it got disabled meanwhile, bail out:
19741 +                        */
19742 +                       if (atomic_read(&t->count))
19743 +                               goto out_disabled;
19744 +                       /*
19745 +                        * If it got scheduled meanwhile, re-execute
19746 +                        * the tasklet function:
19747 +                        */
19748 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
19749 +                               goto again;
19750 +                       if (!--loops) {
19751 +                               printk("hm, tasklet state: %08lx\n", t->state);
19752 +                               WARN_ON(1);
19753 +                               tasklet_unlock(t);
19754 +                               break;
19755 +                       }
19756 +               }
19757         }
19758  }
19759  
19760 +static void tasklet_action(struct softirq_action *a)
19761 +{
19762 +       struct tasklet_struct *list;
19763 +
19764 +       local_irq_disable();
19765 +
19766 +       list = __this_cpu_read(tasklet_vec.head);
19767 +       __this_cpu_write(tasklet_vec.head, NULL);
19768 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
19769 +
19770 +       local_irq_enable();
19771 +
19772 +       __tasklet_action(a, list);
19773 +}
19774 +
19775  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
19776  {
19777         struct tasklet_struct *list;
19778  
19779         local_irq_disable();
19780 +
19781         list = __this_cpu_read(tasklet_hi_vec.head);
19782         __this_cpu_write(tasklet_hi_vec.head, NULL);
19783         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
19784 +
19785         local_irq_enable();
19786  
19787 -       while (list) {
19788 -               struct tasklet_struct *t = list;
19789 -
19790 -               list = list->next;
19791 -
19792 -               if (tasklet_trylock(t)) {
19793 -                       if (!atomic_read(&t->count)) {
19794 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
19795 -                                                       &t->state))
19796 -                                       BUG();
19797 -                               t->func(t->data);
19798 -                               tasklet_unlock(t);
19799 -                               continue;
19800 -                       }
19801 -                       tasklet_unlock(t);
19802 -               }
19803 -
19804 -               local_irq_disable();
19805 -               t->next = NULL;
19806 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
19807 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
19808 -               __raise_softirq_irqoff(HI_SOFTIRQ);
19809 -               local_irq_enable();
19810 -       }
19811 +       __tasklet_action(a, list);
19812  }
19813  
19814  void tasklet_init(struct tasklet_struct *t,
19815 @@ -586,7 +1083,7 @@ void tasklet_kill(struct tasklet_struct *t)
19816  
19817         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
19818                 do {
19819 -                       yield();
19820 +                       msleep(1);
19821                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
19822         }
19823         tasklet_unlock_wait(t);
19824 @@ -660,25 +1157,26 @@ void __init softirq_init(void)
19825         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
19826  }
19827  
19828 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
19829 +void tasklet_unlock_wait(struct tasklet_struct *t)
19830 +{
19831 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
19832 +               /*
19833 +                * Hack for now to avoid this busy-loop:
19834 +                */
19835 +#ifdef CONFIG_PREEMPT_RT_FULL
19836 +               msleep(1);
19837 +#else
19838 +               barrier();
19839 +#endif
19840 +       }
19841 +}
19842 +EXPORT_SYMBOL(tasklet_unlock_wait);
19843 +#endif
19844 +
19845  static int ksoftirqd_should_run(unsigned int cpu)
19846  {
19847 -       return local_softirq_pending();
19848 -}
19849 -
19850 -static void run_ksoftirqd(unsigned int cpu)
19851 -{
19852 -       local_irq_disable();
19853 -       if (local_softirq_pending()) {
19854 -               /*
19855 -                * We can safely run softirq on inline stack, as we are not deep
19856 -                * in the task stack here.
19857 -                */
19858 -               __do_softirq();
19859 -               local_irq_enable();
19860 -               cond_resched_rcu_qs();
19861 -               return;
19862 -       }
19863 -       local_irq_enable();
19864 +       return ksoftirqd_softirq_pending();
19865  }
19866  
19867  #ifdef CONFIG_HOTPLUG_CPU
19868 @@ -745,17 +1243,31 @@ static int takeover_tasklets(unsigned int cpu)
19869  
19870  static struct smp_hotplug_thread softirq_threads = {
19871         .store                  = &ksoftirqd,
19872 +       .setup                  = ksoftirqd_set_sched_params,
19873         .thread_should_run      = ksoftirqd_should_run,
19874         .thread_fn              = run_ksoftirqd,
19875         .thread_comm            = "ksoftirqd/%u",
19876  };
19877  
19878 +#ifdef CONFIG_PREEMPT_RT_FULL
19879 +static struct smp_hotplug_thread softirq_timer_threads = {
19880 +       .store                  = &ktimer_softirqd,
19881 +       .setup                  = ktimer_softirqd_set_sched_params,
19882 +       .cleanup                = ktimer_softirqd_clr_sched_params,
19883 +       .thread_should_run      = ktimer_softirqd_should_run,
19884 +       .thread_fn              = run_ksoftirqd,
19885 +       .thread_comm            = "ktimersoftd/%u",
19886 +};
19887 +#endif
19888 +
19889  static __init int spawn_ksoftirqd(void)
19890  {
19891         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
19892                                   takeover_tasklets);
19893         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
19894 -
19895 +#ifdef CONFIG_PREEMPT_RT_FULL
19896 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
19897 +#endif
19898         return 0;
19899  }
19900  early_initcall(spawn_ksoftirqd);
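With the softirq.c changes above, a tasklet that fires while disabled is parked with TASKLET_STATE_PENDING and re-queued from tasklet_enable(), and tasklet_kill() now sleeps via msleep(1) instead of yield(). A minimal driver-side sketch of usage that stays valid under these semantics; the demo_* names are illustrative and not part of the patch, only the tasklet API calls are real:

#include <linux/interrupt.h>

struct demo_dev {                              /* hypothetical device state */
	struct tasklet_struct rx_tasklet;
};

static void demo_rx_func(unsigned long data)
{
	struct demo_dev *dev = (struct demo_dev *)data;

	/* drain the receive ring of dev ... */
	(void)dev;
}

static void demo_open(struct demo_dev *dev)
{
	tasklet_init(&dev->rx_tasklet, demo_rx_func, (unsigned long)dev);
}

static irqreturn_t demo_irq(int irq, void *cookie)
{
	struct demo_dev *dev = cookie;

	tasklet_schedule(&dev->rx_tasklet);     /* may run in a softirq thread on RT */
	return IRQ_HANDLED;
}

static void demo_close(struct demo_dev *dev)
{
	/* Process context only: tasklet_kill() sleeps while the handler is
	 * scheduled or running, so it must never be called from IRQ or
	 * atomic context - on RT as well as mainline. */
	tasklet_kill(&dev->rx_tasklet);
}

tasklet_disable()/tasklet_enable() pairs keep working as before: a schedule attempt against a disabled tasklet is remembered via the PENDING bit and replayed by the tasklet_enable() shown in the hunk above.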
19901 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
19902 index ec9ab2f01489..8b89dbedeaff 100644
19903 --- a/kernel/stop_machine.c
19904 +++ b/kernel/stop_machine.c
19905 @@ -36,7 +36,7 @@ struct cpu_stop_done {
19906  struct cpu_stopper {
19907         struct task_struct      *thread;
19908  
19909 -       spinlock_t              lock;
19910 +       raw_spinlock_t          lock;
19911         bool                    enabled;        /* is this stopper enabled? */
19912         struct list_head        works;          /* list of pending works */
19913  
19914 @@ -78,14 +78,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
19915         unsigned long flags;
19916         bool enabled;
19917  
19918 -       spin_lock_irqsave(&stopper->lock, flags);
19919 +       raw_spin_lock_irqsave(&stopper->lock, flags);
19920         enabled = stopper->enabled;
19921         if (enabled)
19922                 __cpu_stop_queue_work(stopper, work);
19923         else if (work->done)
19924                 cpu_stop_signal_done(work->done);
19925 -       spin_unlock_irqrestore(&stopper->lock, flags);
19926  
19927 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
19928         return enabled;
19929  }
19930  
19931 @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
19932         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
19933         int err;
19934  retry:
19935 -       spin_lock_irq(&stopper1->lock);
19936 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
19937 +       raw_spin_lock_irq(&stopper1->lock);
19938 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
19939  
19940         err = -ENOENT;
19941         if (!stopper1->enabled || !stopper2->enabled)
19942 @@ -255,8 +255,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
19943         __cpu_stop_queue_work(stopper1, work1);
19944         __cpu_stop_queue_work(stopper2, work2);
19945  unlock:
19946 -       spin_unlock(&stopper2->lock);
19947 -       spin_unlock_irq(&stopper1->lock);
19948 +       raw_spin_unlock(&stopper2->lock);
19949 +       raw_spin_unlock_irq(&stopper1->lock);
19950  
19951         if (unlikely(err == -EDEADLK)) {
19952                 while (stop_cpus_in_progress)
19953 @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
19954         unsigned long flags;
19955         int run;
19956  
19957 -       spin_lock_irqsave(&stopper->lock, flags);
19958 +       raw_spin_lock_irqsave(&stopper->lock, flags);
19959         run = !list_empty(&stopper->works);
19960 -       spin_unlock_irqrestore(&stopper->lock, flags);
19961 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
19962         return run;
19963  }
19964  
19965 @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
19966  
19967  repeat:
19968         work = NULL;
19969 -       spin_lock_irq(&stopper->lock);
19970 +       raw_spin_lock_irq(&stopper->lock);
19971         if (!list_empty(&stopper->works)) {
19972                 work = list_first_entry(&stopper->works,
19973                                         struct cpu_stop_work, list);
19974                 list_del_init(&work->list);
19975         }
19976 -       spin_unlock_irq(&stopper->lock);
19977 +       raw_spin_unlock_irq(&stopper->lock);
19978  
19979         if (work) {
19980                 cpu_stop_fn_t fn = work->fn;
19981 @@ -475,6 +475,8 @@ static void cpu_stopper_thread(unsigned int cpu)
19982                 struct cpu_stop_done *done = work->done;
19983                 int ret;
19984  
19985 +               /* XXX */
19986 +
19987                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
19988                 preempt_count_inc();
19989                 ret = fn(arg);
19990 @@ -541,7 +543,7 @@ static int __init cpu_stop_init(void)
19991         for_each_possible_cpu(cpu) {
19992                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
19993  
19994 -               spin_lock_init(&stopper->lock);
19995 +               raw_spin_lock_init(&stopper->lock);
19996                 INIT_LIST_HEAD(&stopper->works);
19997         }
19998  
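The stop_machine.c hunks convert the per-CPU stopper lock from spinlock_t to raw_spinlock_t because, on PREEMPT_RT, spinlock_t becomes a sleeping rtmutex while the stopper queue is manipulated from contexts that must not sleep. The same conversion pattern, sketched with hypothetical names (demo_stopper and demo_queue_work are not from the patch):

#include <linux/spinlock.h>
#include <linux/list.h>

struct demo_stopper {                          /* hypothetical per-CPU state */
	raw_spinlock_t          lock;           /* spins on RT, never sleeps */
	struct list_head        works;
	bool                    enabled;
};

static void demo_stopper_init(struct demo_stopper *s)
{
	raw_spin_lock_init(&s->lock);
	INIT_LIST_HEAD(&s->works);
	s->enabled = true;
}

static bool demo_queue_work(struct demo_stopper *s, struct list_head *work)
{
	unsigned long flags;
	bool queued = false;

	raw_spin_lock_irqsave(&s->lock, flags);
	if (s->enabled) {
		list_add_tail(work, &s->works);
		queued = true;
	}
	raw_spin_unlock_irqrestore(&s->lock, flags);
	return queued;
}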
19999 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
20000 index bb5ec425dfe0..8338b14ed3a3 100644
20001 --- a/kernel/time/hrtimer.c
20002 +++ b/kernel/time/hrtimer.c
20003 @@ -53,6 +53,7 @@
20004  #include <asm/uaccess.h>
20005  
20006  #include <trace/events/timer.h>
20007 +#include <trace/events/hist.h>
20008  
20009  #include "tick-internal.h"
20010  
20011 @@ -695,6 +696,29 @@ static void hrtimer_switch_to_hres(void)
20012         retrigger_next_event(NULL);
20013  }
20014  
20015 +#ifdef CONFIG_PREEMPT_RT_FULL
20016 +
20017 +static struct swork_event clock_set_delay_work;
20018 +
20019 +static void run_clock_set_delay(struct swork_event *event)
20020 +{
20021 +       clock_was_set();
20022 +}
20023 +
20024 +void clock_was_set_delayed(void)
20025 +{
20026 +       swork_queue(&clock_set_delay_work);
20027 +}
20028 +
20029 +static __init int create_clock_set_delay_thread(void)
20030 +{
20031 +       WARN_ON(swork_get());
20032 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
20033 +       return 0;
20034 +}
20035 +early_initcall(create_clock_set_delay_thread);
20036 +#else /* PREEMPT_RT_FULL */
20037 +
20038  static void clock_was_set_work(struct work_struct *work)
20039  {
20040         clock_was_set();
20041 @@ -710,6 +734,7 @@ void clock_was_set_delayed(void)
20042  {
20043         schedule_work(&hrtimer_work);
20044  }
20045 +#endif
20046  
20047  #else
20048  
20049 @@ -719,11 +744,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
20050  static inline void hrtimer_switch_to_hres(void) { }
20051  static inline void
20052  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
20053 -static inline int hrtimer_reprogram(struct hrtimer *timer,
20054 -                                   struct hrtimer_clock_base *base)
20055 -{
20056 -       return 0;
20057 -}
20058 +static inline void hrtimer_reprogram(struct hrtimer *timer,
20059 +                                    struct hrtimer_clock_base *base) { }
20060  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
20061  static inline void retrigger_next_event(void *arg) { }
20062  
20063 @@ -855,6 +877,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
20064  }
20065  EXPORT_SYMBOL_GPL(hrtimer_forward);
20066  
20067 +#ifdef CONFIG_PREEMPT_RT_BASE
20068 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
20069 +
20070 +/**
20071 + * hrtimer_wait_for_timer - Wait for a running timer
20072 + *
20073 + * @timer:     timer to wait for
20074 + *
20075 + * The function waits on the waitqueue of the timer base in case
20076 + * the timer's callback function is currently executing. The
20077 + * waitqueue is woken up after the timer callback function has
20078 + * finished execution.
20079 + */
20080 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
20081 +{
20082 +       struct hrtimer_clock_base *base = timer->base;
20083 +
20084 +       if (base && base->cpu_base && !timer->irqsafe)
20085 +               wait_event(base->cpu_base->wait,
20086 +                               !(hrtimer_callback_running(timer)));
20087 +}
20088 +
20089 +#else
20090 +# define wake_up_timer_waiters(b)      do { } while (0)
20091 +#endif
20092 +
20093  /*
20094   * enqueue_hrtimer - internal function to (re)start a timer
20095   *
20096 @@ -896,6 +944,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
20097         if (!(state & HRTIMER_STATE_ENQUEUED))
20098                 return;
20099  
20100 +       if (unlikely(!list_empty(&timer->cb_entry))) {
20101 +               list_del_init(&timer->cb_entry);
20102 +               return;
20103 +       }
20104 +
20105         if (!timerqueue_del(&base->active, &timer->node))
20106                 cpu_base->active_bases &= ~(1 << base->index);
20107  
20108 @@ -991,7 +1044,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
20109         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
20110  
20111         timer_stats_hrtimer_set_start_info(timer);
20112 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20113 +       {
20114 +               ktime_t now = new_base->get_time();
20115  
20116 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
20117 +                       timer->praecox = now;
20118 +               else
20119 +                       timer->praecox = ktime_set(0, 0);
20120 +       }
20121 +#endif
20122         leftmost = enqueue_hrtimer(timer, new_base);
20123         if (!leftmost)
20124                 goto unlock;
20125 @@ -1063,7 +1125,7 @@ int hrtimer_cancel(struct hrtimer *timer)
20126  
20127                 if (ret >= 0)
20128                         return ret;
20129 -               cpu_relax();
20130 +               hrtimer_wait_for_timer(timer);
20131         }
20132  }
20133  EXPORT_SYMBOL_GPL(hrtimer_cancel);
20134 @@ -1127,6 +1189,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
20135  
20136         base = hrtimer_clockid_to_base(clock_id);
20137         timer->base = &cpu_base->clock_base[base];
20138 +       INIT_LIST_HEAD(&timer->cb_entry);
20139         timerqueue_init(&timer->node);
20140  
20141  #ifdef CONFIG_TIMER_STATS
20142 @@ -1167,6 +1230,7 @@ bool hrtimer_active(const struct hrtimer *timer)
20143                 seq = raw_read_seqcount_begin(&cpu_base->seq);
20144  
20145                 if (timer->state != HRTIMER_STATE_INACTIVE ||
20146 +                   cpu_base->running_soft == timer ||
20147                     cpu_base->running == timer)
20148                         return true;
20149  
20150 @@ -1265,10 +1329,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
20151         cpu_base->running = NULL;
20152  }
20153  
20154 +#ifdef CONFIG_PREEMPT_RT_BASE
20155 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
20156 +                                struct hrtimer_clock_base *base)
20157 +{
20158 +       int leftmost;
20159 +
20160 +       if (restart != HRTIMER_NORESTART &&
20161 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
20162 +
20163 +               leftmost = enqueue_hrtimer(timer, base);
20164 +               if (!leftmost)
20165 +                       return;
20166 +#ifdef CONFIG_HIGH_RES_TIMERS
20167 +               if (!hrtimer_is_hres_active(timer)) {
20168 +                       /*
20169 +                        * Kick to reschedule the next tick to handle the new timer
20170 +                        * on dynticks target.
20171 +                        */
20172 +                       if (base->cpu_base->nohz_active)
20173 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
20174 +               } else {
20175 +
20176 +                       hrtimer_reprogram(timer, base);
20177 +               }
20178 +#endif
20179 +       }
20180 +}
20181 +
20182 +/*
20183 + * The changes in mainline which removed the callback modes from
20184 + * hrtimer are not yet working with -rt. The non wakeup_process()
20185 + * based callbacks which involve sleeping locks need to be treated
20186 + * separately.
20187 + */
20188 +static void hrtimer_rt_run_pending(void)
20189 +{
20190 +       enum hrtimer_restart (*fn)(struct hrtimer *);
20191 +       struct hrtimer_cpu_base *cpu_base;
20192 +       struct hrtimer_clock_base *base;
20193 +       struct hrtimer *timer;
20194 +       int index, restart;
20195 +
20196 +       local_irq_disable();
20197 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
20198 +
20199 +       raw_spin_lock(&cpu_base->lock);
20200 +
20201 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
20202 +               base = &cpu_base->clock_base[index];
20203 +
20204 +               while (!list_empty(&base->expired)) {
20205 +                       timer = list_first_entry(&base->expired,
20206 +                                                struct hrtimer, cb_entry);
20207 +
20208 +                       /*
20209 +                        * Same as the above __run_hrtimer function,
20210 +                        * except that we run with interrupts enabled.
20211 +                        */
20212 +                       debug_deactivate(timer);
20213 +                       cpu_base->running_soft = timer;
20214 +                       raw_write_seqcount_barrier(&cpu_base->seq);
20215 +
20216 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
20217 +                       timer_stats_account_hrtimer(timer);
20218 +                       fn = timer->function;
20219 +
20220 +                       raw_spin_unlock_irq(&cpu_base->lock);
20221 +                       restart = fn(timer);
20222 +                       raw_spin_lock_irq(&cpu_base->lock);
20223 +
20224 +                       hrtimer_rt_reprogram(restart, timer, base);
20225 +                       raw_write_seqcount_barrier(&cpu_base->seq);
20226 +
20227 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
20228 +                       cpu_base->running_soft = NULL;
20229 +               }
20230 +       }
20231 +
20232 +       raw_spin_unlock_irq(&cpu_base->lock);
20233 +
20234 +       wake_up_timer_waiters(cpu_base);
20235 +}
20236 +
20237 +static int hrtimer_rt_defer(struct hrtimer *timer)
20238 +{
20239 +       if (timer->irqsafe)
20240 +               return 0;
20241 +
20242 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
20243 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
20244 +       return 1;
20245 +}
20246 +
20247 +#else
20248 +
20249 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
20250 +
20251 +#endif
20252 +
20253 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
20254 +
20255  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20256  {
20257         struct hrtimer_clock_base *base = cpu_base->clock_base;
20258         unsigned int active = cpu_base->active_bases;
20259 +       int raise = 0;
20260  
20261         for (; active; base++, active >>= 1) {
20262                 struct timerqueue_node *node;
20263 @@ -1284,6 +1450,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20264  
20265                         timer = container_of(node, struct hrtimer, node);
20266  
20267 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
20268 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
20269 +                               timer->praecox : hrtimer_get_expires(timer),
20270 +                               basenow)),
20271 +                           current,
20272 +                           timer->function == hrtimer_wakeup ?
20273 +                           container_of(timer, struct hrtimer_sleeper,
20274 +                               timer)->task : NULL);
20275 +
20276                         /*
20277                          * The immediate goal for using the softexpires is
20278                          * minimizing wakeups, not running timers at the
20279 @@ -1299,9 +1474,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
20280                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
20281                                 break;
20282  
20283 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
20284 +                       if (!hrtimer_rt_defer(timer))
20285 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
20286 +                       else
20287 +                               raise = 1;
20288                 }
20289         }
20290 +       if (raise)
20291 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
20292  }
20293  
20294  #ifdef CONFIG_HIGH_RES_TIMERS
20295 @@ -1464,16 +1644,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
20296  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
20297  {
20298         sl->timer.function = hrtimer_wakeup;
20299 +       sl->timer.irqsafe = 1;
20300         sl->task = task;
20301  }
20302  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
20303  
20304 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
20305 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
20306 +                               unsigned long state)
20307  {
20308         hrtimer_init_sleeper(t, current);
20309  
20310         do {
20311 -               set_current_state(TASK_INTERRUPTIBLE);
20312 +               set_current_state(state);
20313                 hrtimer_start_expires(&t->timer, mode);
20314  
20315                 if (likely(t->task))
20316 @@ -1515,7 +1697,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
20317                                 HRTIMER_MODE_ABS);
20318         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
20319  
20320 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
20321 +       /* cpu_chill() does not care about restart state. */
20322 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
20323                 goto out;
20324  
20325         rmtp = restart->nanosleep.rmtp;
20326 @@ -1532,8 +1715,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
20327         return ret;
20328  }
20329  
20330 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
20331 -                      const enum hrtimer_mode mode, const clockid_t clockid)
20332 +static long
20333 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
20334 +                   const enum hrtimer_mode mode, const clockid_t clockid,
20335 +                   unsigned long state)
20336  {
20337         struct restart_block *restart;
20338         struct hrtimer_sleeper t;
20339 @@ -1546,7 +1731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
20340  
20341         hrtimer_init_on_stack(&t.timer, clockid, mode);
20342         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
20343 -       if (do_nanosleep(&t, mode))
20344 +       if (do_nanosleep(&t, mode, state))
20345                 goto out;
20346  
20347         /* Absolute timers do not update the rmtp value and restart: */
20348 @@ -1573,6 +1758,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
20349         return ret;
20350  }
20351  
20352 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
20353 +                      const enum hrtimer_mode mode, const clockid_t clockid)
20354 +{
20355 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
20356 +}
20357 +
20358  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
20359                 struct timespec __user *, rmtp)
20360  {
20361 @@ -1587,6 +1778,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
20362         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
20363  }
20364  
20365 +#ifdef CONFIG_PREEMPT_RT_FULL
20366 +/*
20367 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
20368 + */
20369 +void cpu_chill(void)
20370 +{
20371 +       struct timespec tu = {
20372 +               .tv_nsec = NSEC_PER_MSEC,
20373 +       };
20374 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
20375 +
20376 +       current->flags |= PF_NOFREEZE;
20377 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
20378 +                           TASK_UNINTERRUPTIBLE);
20379 +       if (!freeze_flag)
20380 +               current->flags &= ~PF_NOFREEZE;
20381 +}
20382 +EXPORT_SYMBOL(cpu_chill);
20383 +#endif
20384 +
20385  /*
20386   * Functions related to boot-time initialization:
20387   */
20388 @@ -1598,10 +1809,14 @@ int hrtimers_prepare_cpu(unsigned int cpu)
20389         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
20390                 cpu_base->clock_base[i].cpu_base = cpu_base;
20391                 timerqueue_init_head(&cpu_base->clock_base[i].active);
20392 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
20393         }
20394  
20395         cpu_base->cpu = cpu;
20396         hrtimer_init_hres(cpu_base);
20397 +#ifdef CONFIG_PREEMPT_RT_BASE
20398 +       init_waitqueue_head(&cpu_base->wait);
20399 +#endif
20400         return 0;
20401  }
20402  
20403 @@ -1671,9 +1886,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
20404  
20405  #endif /* CONFIG_HOTPLUG_CPU */
20406  
20407 +#ifdef CONFIG_PREEMPT_RT_BASE
20408 +
20409 +static void run_hrtimer_softirq(struct softirq_action *h)
20410 +{
20411 +       hrtimer_rt_run_pending();
20412 +}
20413 +
20414 +static void hrtimers_open_softirq(void)
20415 +{
20416 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
20417 +}
20418 +
20419 +#else
20420 +static void hrtimers_open_softirq(void) { }
20421 +#endif
20422 +
20423  void __init hrtimers_init(void)
20424  {
20425         hrtimers_prepare_cpu(smp_processor_id());
20426 +       hrtimers_open_softirq();
20427  }
20428  
20429  /**
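The hrtimer.c changes split expiry handling on RT: timers whose callback is not flagged irqsafe are moved to a per-base expired list and run from HRTIMER_SOFTIRQ with interrupts enabled, while hrtimer_cancel() now sleeps on the base's waitqueue instead of spinning. A callback that genuinely must run from the timer interrupt opts out by setting the irqsafe flag, as the patch itself does for the tick sched_timer and the broadcast hrtimer. A small sketch with hypothetical names (demo_timer, demo_cb):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_cb(struct hrtimer *t)
{
	/* Runs in hard interrupt context because of .irqsafe below,
	 * so it must not take sleeping locks - not even on RT. */
	return HRTIMER_NORESTART;
}

static void demo_arm(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_cb;
	demo_timer.irqsafe = 1;                 /* field introduced by this patch */
	hrtimer_start(&demo_timer, ms_to_ktime(10), HRTIMER_MODE_REL);
}

Leaving irqsafe at 0 is the safe default: the callback is then deferred via hrtimer_rt_defer() and may take sleeping (rtmutex-based) locks.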
20430 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
20431 index 1d5c7204ddc9..184de6751180 100644
20432 --- a/kernel/time/itimer.c
20433 +++ b/kernel/time/itimer.c
20434 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
20435                 /* We are sharing ->siglock with it_real_fn() */
20436                 if (hrtimer_try_to_cancel(timer) < 0) {
20437                         spin_unlock_irq(&tsk->sighand->siglock);
20438 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
20439                         goto again;
20440                 }
20441                 expires = timeval_to_ktime(value->it_value);
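The itimer.c change illustrates the cancel-and-wait idiom that hrtimer_wait_for_timer() (added by this patch) enables: if hrtimer_try_to_cancel() returns -1 because the callback is running, the caller drops its lock, sleeps until the callback has finished, and retries. A generic sketch of the same idiom with a hypothetical lock/timer pair (demo_lock, demo_timer):

#include <linux/hrtimer.h>
#include <linux/spinlock.h>

static void demo_sync_cancel(spinlock_t *demo_lock, struct hrtimer *demo_timer)
{
again:
	spin_lock_irq(demo_lock);
	if (hrtimer_try_to_cancel(demo_timer) < 0) {
		/* Callback is running: drop the lock it might need and
		 * sleep until it has finished (no busy-waiting on RT). */
		spin_unlock_irq(demo_lock);
		hrtimer_wait_for_timer(demo_timer);
		goto again;
	}
	/* Timer is guaranteed inactive here; tear down state, then unlock. */
	spin_unlock_irq(demo_lock);
}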
20442 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
20443 index 555e21f7b966..a5d6435fabbb 100644
20444 --- a/kernel/time/jiffies.c
20445 +++ b/kernel/time/jiffies.c
20446 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
20447         .max_cycles     = 10,
20448  };
20449  
20450 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
20451 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
20452 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
20453  
20454  #if (BITS_PER_LONG < 64)
20455  u64 get_jiffies_64(void)
20456 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
20457         u64 ret;
20458  
20459         do {
20460 -               seq = read_seqbegin(&jiffies_lock);
20461 +               seq = read_seqcount_begin(&jiffies_seq);
20462                 ret = jiffies_64;
20463 -       } while (read_seqretry(&jiffies_lock, seq));
20464 +       } while (read_seqcount_retry(&jiffies_seq, seq));
20465         return ret;
20466  }
20467  EXPORT_SYMBOL(get_jiffies_64);
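jiffies_lock is split from a seqlock_t into a raw_spinlock_t for writers plus a bare seqcount_t for lockless readers, so the writer side stays non-sleeping on RT while readers keep the retry loop shown in get_jiffies_64() above. The general shape of that split, with illustrative names (demo_lock, demo_seq, demo_value):

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static DEFINE_RAW_SPINLOCK(demo_lock);
static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static u64 demo_value;

static void demo_write(u64 v)
{
	raw_spin_lock(&demo_lock);              /* serializes writers, never sleeps */
	write_seqcount_begin(&demo_seq);
	demo_value = v;
	write_seqcount_end(&demo_seq);
	raw_spin_unlock(&demo_lock);
}

static u64 demo_read(void)
{
	unsigned int seq;
	u64 v;

	do {                                    /* lockless reader, retries on a concurrent write */
		seq = read_seqcount_begin(&demo_seq);
		v = demo_value;
	} while (read_seqcount_retry(&demo_seq, seq));

	return v;
}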
20468 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
20469 index 6df8927c58a5..05b7391bf9bd 100644
20470 --- a/kernel/time/ntp.c
20471 +++ b/kernel/time/ntp.c
20472 @@ -17,6 +17,7 @@
20473  #include <linux/module.h>
20474  #include <linux/rtc.h>
20475  #include <linux/math64.h>
20476 +#include <linux/swork.h>
20477  
20478  #include "ntp_internal.h"
20479  #include "timekeeping_internal.h"
20480 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
20481                            &sync_cmos_work, timespec64_to_jiffies(&next));
20482  }
20483  
20484 +#ifdef CONFIG_PREEMPT_RT_FULL
20485 +
20486 +static void run_clock_set_delay(struct swork_event *event)
20487 +{
20488 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
20489 +}
20490 +
20491 +static struct swork_event ntp_cmos_swork;
20492 +
20493 +void ntp_notify_cmos_timer(void)
20494 +{
20495 +       swork_queue(&ntp_cmos_swork);
20496 +}
20497 +
20498 +static __init int create_cmos_delay_thread(void)
20499 +{
20500 +       WARN_ON(swork_get());
20501 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
20502 +       return 0;
20503 +}
20504 +early_initcall(create_cmos_delay_thread);
20505 +
20506 +#else
20507 +
20508  void ntp_notify_cmos_timer(void)
20509  {
20510         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
20511  }
20512 +#endif /* CONFIG_PREEMPT_RT_FULL */
20513  
20514  #else
20515  void ntp_notify_cmos_timer(void) { }
20516 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
20517 index 39008d78927a..633f4eaca9e7 100644
20518 --- a/kernel/time/posix-cpu-timers.c
20519 +++ b/kernel/time/posix-cpu-timers.c
20520 @@ -3,6 +3,7 @@
20521   */
20522  
20523  #include <linux/sched.h>
20524 +#include <linux/sched/rt.h>
20525  #include <linux/posix-timers.h>
20526  #include <linux/errno.h>
20527  #include <linux/math64.h>
20528 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
20529         /*
20530          * Disarm any old timer after extracting its expiry time.
20531          */
20532 -       WARN_ON_ONCE(!irqs_disabled());
20533 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
20534  
20535         ret = 0;
20536         old_incr = timer->it.cpu.incr;
20537 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
20538         /*
20539          * Now re-arm for the new expiry time.
20540          */
20541 -       WARN_ON_ONCE(!irqs_disabled());
20542 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
20543         arm_timer(timer);
20544         unlock_task_sighand(p, &flags);
20545  
20546 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
20547   * already updated our counts.  We need to check if any timers fire now.
20548   * Interrupts are disabled.
20549   */
20550 -void run_posix_cpu_timers(struct task_struct *tsk)
20551 +static void __run_posix_cpu_timers(struct task_struct *tsk)
20552  {
20553         LIST_HEAD(firing);
20554         struct k_itimer *timer, *next;
20555         unsigned long flags;
20556  
20557 -       WARN_ON_ONCE(!irqs_disabled());
20558 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
20559  
20560         /*
20561          * The fast path checks that there are no expired thread or thread
20562 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
20563         }
20564  }
20565  
20566 +#ifdef CONFIG_PREEMPT_RT_BASE
20567 +#include <linux/kthread.h>
20568 +#include <linux/cpu.h>
20569 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
20570 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
20571 +
20572 +static int posix_cpu_timers_thread(void *data)
20573 +{
20574 +       int cpu = (long)data;
20575 +
20576 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
20577 +
20578 +       while (!kthread_should_stop()) {
20579 +               struct task_struct *tsk = NULL;
20580 +               struct task_struct *next = NULL;
20581 +
20582 +               if (cpu_is_offline(cpu))
20583 +                       goto wait_to_die;
20584 +
20585 +               /* grab task list */
20586 +               raw_local_irq_disable();
20587 +               tsk = per_cpu(posix_timer_tasklist, cpu);
20588 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
20589 +               raw_local_irq_enable();
20590 +
20591 +               /* it's possible the list is empty, just go back to sleep */
20592 +               if (!tsk) {
20593 +                       set_current_state(TASK_INTERRUPTIBLE);
20594 +                       schedule();
20595 +                       __set_current_state(TASK_RUNNING);
20596 +                       continue;
20597 +               }
20598 +
20599 +               /* Process task list */
20600 +               while (1) {
20601 +                       /* save next */
20602 +                       next = tsk->posix_timer_list;
20603 +
20604 +                       /* run the task timers, clear its ptr and
20605 +                        * unreference it
20606 +                        */
20607 +                       __run_posix_cpu_timers(tsk);
20608 +                       tsk->posix_timer_list = NULL;
20609 +                       put_task_struct(tsk);
20610 +
20611 +                       /* check if this is the last on the list */
20612 +                       if (next == tsk)
20613 +                               break;
20614 +                       tsk = next;
20615 +               }
20616 +       }
20617 +       return 0;
20618 +
20619 +wait_to_die:
20620 +       /* Wait for kthread_stop */
20621 +       set_current_state(TASK_INTERRUPTIBLE);
20622 +       while (!kthread_should_stop()) {
20623 +               schedule();
20624 +               set_current_state(TASK_INTERRUPTIBLE);
20625 +       }
20626 +       __set_current_state(TASK_RUNNING);
20627 +       return 0;
20628 +}
20629 +
20630 +static inline int __fastpath_timer_check(struct task_struct *tsk)
20631 +{
20632 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
20633 +       if (unlikely(tsk->exit_state))
20634 +               return 0;
20635 +
20636 +       if (!task_cputime_zero(&tsk->cputime_expires))
20637 +               return 1;
20638 +
20639 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
20640 +               return 1;
20641 +
20642 +       return 0;
20643 +}
20644 +
20645 +void run_posix_cpu_timers(struct task_struct *tsk)
20646 +{
20647 +       unsigned long cpu = smp_processor_id();
20648 +       struct task_struct *tasklist;
20649 +
20650 +       BUG_ON(!irqs_disabled());
20651 +       if(!per_cpu(posix_timer_task, cpu))
20652 +               return;
20653 +       /* get per-cpu references */
20654 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
20655 +
20656 +       /* check to see if we're already queued */
20657 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
20658 +               get_task_struct(tsk);
20659 +               if (tasklist) {
20660 +                       tsk->posix_timer_list = tasklist;
20661 +               } else {
20662 +                       /*
20663 +                        * The list is terminated by a self-pointing
20664 +                        * task_struct
20665 +                        */
20666 +                       tsk->posix_timer_list = tsk;
20667 +               }
20668 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
20669 +
20670 +               wake_up_process(per_cpu(posix_timer_task, cpu));
20671 +       }
20672 +}
20673 +
20674 +/*
20675 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
20676 + * Here we can start up the necessary posix timer thread for the new CPU.
20677 + */
20678 +static int posix_cpu_thread_call(struct notifier_block *nfb,
20679 +                                unsigned long action, void *hcpu)
20680 +{
20681 +       int cpu = (long)hcpu;
20682 +       struct task_struct *p;
20683 +       struct sched_param param;
20684 +
20685 +       switch (action) {
20686 +       case CPU_UP_PREPARE:
20687 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
20688 +                                       "posixcputmr/%d",cpu);
20689 +               if (IS_ERR(p))
20690 +                       return NOTIFY_BAD;
20691 +               p->flags |= PF_NOFREEZE;
20692 +               kthread_bind(p, cpu);
20693 +               /* Must be high prio to avoid getting starved */
20694 +               param.sched_priority = MAX_RT_PRIO-1;
20695 +               sched_setscheduler(p, SCHED_FIFO, &param);
20696 +               per_cpu(posix_timer_task,cpu) = p;
20697 +               break;
20698 +       case CPU_ONLINE:
20699 +               /* Strictly unnecessary, as the first user will wake it. */
20700 +               wake_up_process(per_cpu(posix_timer_task,cpu));
20701 +               break;
20702 +#ifdef CONFIG_HOTPLUG_CPU
20703 +       case CPU_UP_CANCELED:
20704 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
20705 +               kthread_bind(per_cpu(posix_timer_task, cpu),
20706 +                            cpumask_any(cpu_online_mask));
20707 +               kthread_stop(per_cpu(posix_timer_task,cpu));
20708 +               per_cpu(posix_timer_task,cpu) = NULL;
20709 +               break;
20710 +       case CPU_DEAD:
20711 +               kthread_stop(per_cpu(posix_timer_task,cpu));
20712 +               per_cpu(posix_timer_task,cpu) = NULL;
20713 +               break;
20714 +#endif
20715 +       }
20716 +       return NOTIFY_OK;
20717 +}
20718 +
20719 +/* Register at highest priority so that task migration (migrate_all_tasks)
20720 + * happens before everything else.
20721 + */
20722 +static struct notifier_block posix_cpu_thread_notifier = {
20723 +       .notifier_call = posix_cpu_thread_call,
20724 +       .priority = 10
20725 +};
20726 +
20727 +static int __init posix_cpu_thread_init(void)
20728 +{
20729 +       void *hcpu = (void *)(long)smp_processor_id();
20730 +       /* Start one for boot CPU. */
20731 +       unsigned long cpu;
20732 +
20733 +       /* init the per-cpu posix_timer_tasklets */
20734 +       for_each_possible_cpu(cpu)
20735 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
20736 +
20737 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
20738 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
20739 +       register_cpu_notifier(&posix_cpu_thread_notifier);
20740 +       return 0;
20741 +}
20742 +early_initcall(posix_cpu_thread_init);
20743 +#else /* CONFIG_PREEMPT_RT_BASE */
20744 +void run_posix_cpu_timers(struct task_struct *tsk)
20745 +{
20746 +       __run_posix_cpu_timers(tsk);
20747 +}
20748 +#endif /* CONFIG_PREEMPT_RT_BASE */
20749 +
20750  /*
20751   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
20752   * The tsk->sighand->siglock must be held by the caller.
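On RT the posix-cpu-timers hunk defers timer expiry from the tick into a per-CPU "posixcputmr/N" kthread. Tasks with pending expiries are chained through tsk->posix_timer_list, and the first queued entry points to itself so that NULL can keep meaning "not queued". A minimal sketch of that self-terminated list pattern with a hypothetical node type (demo_node, demo_push and demo_walk are not from the patch):

#include <linux/stddef.h>

struct demo_node {
	struct demo_node *next;         /* NULL = not queued, self = last entry */
};

static void demo_push(struct demo_node **head, struct demo_node *n)
{
	/* an empty list terminates in the node itself rather than NULL */
	n->next = *head ? *head : n;
	*head = n;
}

static void demo_walk(struct demo_node *head)
{
	struct demo_node *n = head;

	while (n) {
		struct demo_node *next = n->next;

		/* process n ... */
		n->next = NULL;         /* mark as no longer queued */

		if (next == n)          /* self-pointing tail: end of list */
			break;
		n = next;
	}
}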
20753 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
20754 index f2826c35e918..464a98155a0e 100644
20755 --- a/kernel/time/posix-timers.c
20756 +++ b/kernel/time/posix-timers.c
20757 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
20758  static struct pid *good_sigevent(sigevent_t * event)
20759  {
20760         struct task_struct *rtn = current->group_leader;
20761 +       int sig = event->sigev_signo;
20762  
20763         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
20764                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
20765 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
20766                 return NULL;
20767  
20768         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
20769 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
20770 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
20771 +            sig_kernel_coredump(sig)))
20772                 return NULL;
20773  
20774         return task_pid(rtn);
20775 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
20776         return overrun;
20777  }
20778  
20779 +/*
20780 + * Protected by RCU!
20781 + */
20782 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
20783 +{
20784 +#ifdef CONFIG_PREEMPT_RT_FULL
20785 +       if (kc->timer_set == common_timer_set)
20786 +               hrtimer_wait_for_timer(&timr->it.real.timer);
20787 +       else
20788 +               /* FIXME: Whacky hack for posix-cpu-timers */
20789 +               schedule_timeout(1);
20790 +#endif
20791 +}
20792 +
20793  /* Set a POSIX.1b interval timer. */
20794  /* timr->it_lock is taken. */
20795  static int
20796 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
20797         if (!timr)
20798                 return -EINVAL;
20799  
20800 +       rcu_read_lock();
20801         kc = clockid_to_kclock(timr->it_clock);
20802         if (WARN_ON_ONCE(!kc || !kc->timer_set))
20803                 error = -EINVAL;
20804 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
20805  
20806         unlock_timer(timr, flag);
20807         if (error == TIMER_RETRY) {
20808 +               timer_wait_for_callback(kc, timr);
20809                 rtn = NULL;     // We already got the old time...
20810 +               rcu_read_unlock();
20811                 goto retry;
20812         }
20813 +       rcu_read_unlock();
20814  
20815         if (old_setting && !error &&
20816             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
20817 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
20818         if (!timer)
20819                 return -EINVAL;
20820  
20821 +       rcu_read_lock();
20822         if (timer_delete_hook(timer) == TIMER_RETRY) {
20823                 unlock_timer(timer, flags);
20824 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
20825 +                                       timer);
20826 +               rcu_read_unlock();
20827                 goto retry_delete;
20828         }
20829 +       rcu_read_unlock();
20830  
20831         spin_lock(&current->sighand->siglock);
20832         list_del(&timer->list);
20833 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
20834  retry_delete:
20835         spin_lock_irqsave(&timer->it_lock, flags);
20836  
20837 -       if (timer_delete_hook(timer) == TIMER_RETRY) {
20838 +       /* On RT we can race with a deletion */
20839 +       if (!timer->it_signal) {
20840                 unlock_timer(timer, flags);
20841 +               return;
20842 +       }
20843 +
20844 +       if (timer_delete_hook(timer) == TIMER_RETRY) {
20845 +               rcu_read_lock();
20846 +               unlock_timer(timer, flags);
20847 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
20848 +                                       timer);
20849 +               rcu_read_unlock();
20850                 goto retry_delete;
20851         }
20852         list_del(&timer->list);
20853 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
20854 index 690b797f522e..fe8ba1619879 100644
20855 --- a/kernel/time/tick-broadcast-hrtimer.c
20856 +++ b/kernel/time/tick-broadcast-hrtimer.c
20857 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
20858  {
20859         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
20860         bctimer.function = bc_handler;
20861 +       bctimer.irqsafe = true;
20862         clockevents_register_device(&ce_broadcast_hrtimer);
20863  }
20864 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
20865 index 4fcd99e12aa0..5a47f2e98faf 100644
20866 --- a/kernel/time/tick-common.c
20867 +++ b/kernel/time/tick-common.c
20868 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
20869  static void tick_periodic(int cpu)
20870  {
20871         if (tick_do_timer_cpu == cpu) {
20872 -               write_seqlock(&jiffies_lock);
20873 +               raw_spin_lock(&jiffies_lock);
20874 +               write_seqcount_begin(&jiffies_seq);
20875  
20876                 /* Keep track of the next tick event */
20877                 tick_next_period = ktime_add(tick_next_period, tick_period);
20878  
20879                 do_timer(1);
20880 -               write_sequnlock(&jiffies_lock);
20881 +               write_seqcount_end(&jiffies_seq);
20882 +               raw_spin_unlock(&jiffies_lock);
20883                 update_wall_time();
20884         }
20885  
20886 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
20887                 ktime_t next;
20888  
20889                 do {
20890 -                       seq = read_seqbegin(&jiffies_lock);
20891 +                       seq = read_seqcount_begin(&jiffies_seq);
20892                         next = tick_next_period;
20893 -               } while (read_seqretry(&jiffies_lock, seq));
20894 +               } while (read_seqcount_retry(&jiffies_seq, seq));
20895  
20896                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
20897  
20898 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
20899 index 3bcb61b52f6c..66d85482a96e 100644
20900 --- a/kernel/time/tick-sched.c
20901 +++ b/kernel/time/tick-sched.c
20902 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
20903                 return;
20904  
20905         /* Reevaluate with jiffies_lock held */
20906 -       write_seqlock(&jiffies_lock);
20907 +       raw_spin_lock(&jiffies_lock);
20908 +       write_seqcount_begin(&jiffies_seq);
20909  
20910         delta = ktime_sub(now, last_jiffies_update);
20911         if (delta.tv64 >= tick_period.tv64) {
20912 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
20913                 /* Keep the tick_next_period variable up to date */
20914                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
20915         } else {
20916 -               write_sequnlock(&jiffies_lock);
20917 +               write_seqcount_end(&jiffies_seq);
20918 +               raw_spin_unlock(&jiffies_lock);
20919                 return;
20920         }
20921 -       write_sequnlock(&jiffies_lock);
20922 +       write_seqcount_end(&jiffies_seq);
20923 +       raw_spin_unlock(&jiffies_lock);
20924         update_wall_time();
20925  }
20926  
20927 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
20928  {
20929         ktime_t period;
20930  
20931 -       write_seqlock(&jiffies_lock);
20932 +       raw_spin_lock(&jiffies_lock);
20933 +       write_seqcount_begin(&jiffies_seq);
20934         /* Did we start the jiffies update yet ? */
20935         if (last_jiffies_update.tv64 == 0)
20936                 last_jiffies_update = tick_next_period;
20937         period = last_jiffies_update;
20938 -       write_sequnlock(&jiffies_lock);
20939 +       write_seqcount_end(&jiffies_seq);
20940 +       raw_spin_unlock(&jiffies_lock);
20941         return period;
20942  }
20943  
20944 @@ -215,6 +220,7 @@ static void nohz_full_kick_func(struct irq_work *work)
20945  
20946  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
20947         .func = nohz_full_kick_func,
20948 +       .flags = IRQ_WORK_HARD_IRQ,
20949  };
20950  
20951  /*
20952 @@ -673,10 +679,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
20953  
20954         /* Read jiffies and the time when jiffies were updated last */
20955         do {
20956 -               seq = read_seqbegin(&jiffies_lock);
20957 +               seq = read_seqcount_begin(&jiffies_seq);
20958                 basemono = last_jiffies_update.tv64;
20959                 basejiff = jiffies;
20960 -       } while (read_seqretry(&jiffies_lock, seq));
20961 +       } while (read_seqcount_retry(&jiffies_seq, seq));
20962         ts->last_jiffies = basejiff;
20963  
20964         if (rcu_needs_cpu(basemono, &next_rcu) ||
20965 @@ -877,14 +883,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
20966                 return false;
20967  
20968         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
20969 -               static int ratelimit;
20970 -
20971 -               if (ratelimit < 10 &&
20972 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
20973 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
20974 -                               (unsigned int) local_softirq_pending());
20975 -                       ratelimit++;
20976 -               }
20977 +               softirq_check_pending_idle();
20978                 return false;
20979         }
20980  
20981 @@ -1193,6 +1192,7 @@ void tick_setup_sched_timer(void)
20982          * Emulate tick processing via per-CPU hrtimers:
20983          */
20984         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
20985 +       ts->sched_timer.irqsafe = 1;
20986         ts->sched_timer.function = tick_sched_timer;
20987  
20988         /* Get the next period (per-CPU) */
20989 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
20990 index 46e312e9be38..fa75cf5d9253 100644
20991 --- a/kernel/time/timekeeping.c
20992 +++ b/kernel/time/timekeeping.c
20993 @@ -2328,8 +2328,10 @@ EXPORT_SYMBOL(hardpps);
20994   */
20995  void xtime_update(unsigned long ticks)
20996  {
20997 -       write_seqlock(&jiffies_lock);
20998 +       raw_spin_lock(&jiffies_lock);
20999 +       write_seqcount_begin(&jiffies_seq);
21000         do_timer(ticks);
21001 -       write_sequnlock(&jiffies_lock);
21002 +       write_seqcount_end(&jiffies_seq);
21003 +       raw_spin_unlock(&jiffies_lock);
21004         update_wall_time();
21005  }
21006 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
21007 index 704f595ce83f..763a3e5121ff 100644
21008 --- a/kernel/time/timekeeping.h
21009 +++ b/kernel/time/timekeeping.h
21010 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
21011  extern void do_timer(unsigned long ticks);
21012  extern void update_wall_time(void);
21013  
21014 -extern seqlock_t jiffies_lock;
21015 +extern raw_spinlock_t jiffies_lock;
21016 +extern seqcount_t jiffies_seq;
21017  
21018  #define CS_NAME_LEN    32
21019  
21020 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
21021 index c611c47de884..cdff4411f8f6 100644
21022 --- a/kernel/time/timer.c
21023 +++ b/kernel/time/timer.c
21024 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
21025  #endif
21026  
21027  struct timer_base {
21028 -       spinlock_t              lock;
21029 +       raw_spinlock_t          lock;
21030         struct timer_list       *running_timer;
21031 +#ifdef CONFIG_PREEMPT_RT_FULL
21032 +       struct swait_queue_head wait_for_running_timer;
21033 +#endif
21034         unsigned long           clk;
21035         unsigned long           next_expiry;
21036         unsigned int            cpu;
21037 @@ -203,6 +206,8 @@ struct timer_base {
21038         bool                    is_idle;
21039         DECLARE_BITMAP(pending_map, WHEEL_SIZE);
21040         struct hlist_head       vectors[WHEEL_SIZE];
21041 +       struct hlist_head       expired_lists[LVL_DEPTH];
21042 +       int                     expired_count;
21043  } ____cacheline_aligned;
21044  
21045  static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
21046 @@ -948,10 +953,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
21047  
21048                 if (!(tf & TIMER_MIGRATING)) {
21049                         base = get_timer_base(tf);
21050 -                       spin_lock_irqsave(&base->lock, *flags);
21051 +                       raw_spin_lock_irqsave(&base->lock, *flags);
21052                         if (timer->flags == tf)
21053                                 return base;
21054 -                       spin_unlock_irqrestore(&base->lock, *flags);
21055 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
21056                 }
21057                 cpu_relax();
21058         }
21059 @@ -1023,9 +1028,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
21060                         /* See the comment in lock_timer_base() */
21061                         timer->flags |= TIMER_MIGRATING;
21062  
21063 -                       spin_unlock(&base->lock);
21064 +                       raw_spin_unlock(&base->lock);
21065                         base = new_base;
21066 -                       spin_lock(&base->lock);
21067 +                       raw_spin_lock(&base->lock);
21068                         WRITE_ONCE(timer->flags,
21069                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
21070                 }
21071 @@ -1050,7 +1055,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
21072         }
21073  
21074  out_unlock:
21075 -       spin_unlock_irqrestore(&base->lock, flags);
21076 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21077  
21078         return ret;
21079  }
21080 @@ -1144,19 +1149,46 @@ void add_timer_on(struct timer_list *timer, int cpu)
21081         if (base != new_base) {
21082                 timer->flags |= TIMER_MIGRATING;
21083  
21084 -               spin_unlock(&base->lock);
21085 +               raw_spin_unlock(&base->lock);
21086                 base = new_base;
21087 -               spin_lock(&base->lock);
21088 +               raw_spin_lock(&base->lock);
21089                 WRITE_ONCE(timer->flags,
21090                            (timer->flags & ~TIMER_BASEMASK) | cpu);
21091         }
21092  
21093         debug_activate(timer, timer->expires);
21094         internal_add_timer(base, timer);
21095 -       spin_unlock_irqrestore(&base->lock, flags);
21096 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21097  }
21098  EXPORT_SYMBOL_GPL(add_timer_on);
21099  
21100 +#ifdef CONFIG_PREEMPT_RT_FULL
21101 +/*
21102 + * Wait for a running timer
21103 + */
21104 +static void wait_for_running_timer(struct timer_list *timer)
21105 +{
21106 +       struct timer_base *base;
21107 +       u32 tf = timer->flags;
21108 +
21109 +       if (tf & TIMER_MIGRATING)
21110 +               return;
21111 +
21112 +       base = get_timer_base(tf);
21113 +       swait_event(base->wait_for_running_timer,
21114 +                  base->running_timer != timer);
21115 +}
21116 +
21117 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
21118 +#else
21119 +static inline void wait_for_running_timer(struct timer_list *timer)
21120 +{
21121 +       cpu_relax();
21122 +}
21123 +
21124 +# define wakeup_timer_waiters(b)       do { } while (0)
21125 +#endif
21126 +
21127  /**
21128   * del_timer - deactivate a timer.
21129   * @timer: the timer to be deactivated

21130 @@ -1180,7 +1212,7 @@ int del_timer(struct timer_list *timer)
21131         if (timer_pending(timer)) {
21132                 base = lock_timer_base(timer, &flags);
21133                 ret = detach_if_pending(timer, base, true);
21134 -               spin_unlock_irqrestore(&base->lock, flags);
21135 +               raw_spin_unlock_irqrestore(&base->lock, flags);
21136         }
21137  
21138         return ret;
21139 @@ -1208,13 +1240,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
21140                 timer_stats_timer_clear_start_info(timer);
21141                 ret = detach_if_pending(timer, base, true);
21142         }
21143 -       spin_unlock_irqrestore(&base->lock, flags);
21144 +       raw_spin_unlock_irqrestore(&base->lock, flags);
21145  
21146         return ret;
21147  }
21148  EXPORT_SYMBOL(try_to_del_timer_sync);
21149  
21150 -#ifdef CONFIG_SMP
21151 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
21152  /**
21153   * del_timer_sync - deactivate a timer and wait for the handler to finish.
21154   * @timer: the timer to be deactivated
21155 @@ -1274,7 +1306,7 @@ int del_timer_sync(struct timer_list *timer)
21156                 int ret = try_to_del_timer_sync(timer);
21157                 if (ret >= 0)
21158                         return ret;
21159 -               cpu_relax();
21160 +               wait_for_running_timer(timer);
21161         }
21162  }
21163  EXPORT_SYMBOL(del_timer_sync);
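
del_timer_sync() no longer busy-waits with cpu_relax() while the callback is running: on PREEMPT_RT_FULL it now sleeps in wait_for_running_timer() on base->wait_for_running_timer until the expiry path clears base->running_timer, and __run_timers() below wakes the waiters once the base lock has been dropped. Nothing changes for users of the API except that the wait may now sleep. A hypothetical driver-style user, for illustration:

        #include <linux/timer.h>
        #include <linux/jiffies.h>
        #include <linux/printk.h>

        static void demo_timeout(unsigned long data)    /* hypothetical callback */
        {
                pr_info("demo timer fired\n");
        }

        static struct timer_list demo_timer;

        static void demo_start(void)
        {
                setup_timer(&demo_timer, demo_timeout, 0);
                mod_timer(&demo_timer, jiffies + HZ);   /* ~1 second from now */
        }

        static void demo_stop(void)
        {
                /* May sleep on RT while demo_timeout() runs; must therefore be
                 * called from sleepable context, as was already required. */
                del_timer_sync(&demo_timer);
        }
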
21164 @@ -1323,7 +1355,8 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
21165         }
21166  }
21167  
21168 -static void expire_timers(struct timer_base *base, struct hlist_head *head)
21169 +static inline void __expire_timers(struct timer_base *base,
21170 +                                  struct hlist_head *head)
21171  {
21172         while (!hlist_empty(head)) {
21173                 struct timer_list *timer;
21174 @@ -1339,33 +1372,53 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
21175                 fn = timer->function;
21176                 data = timer->data;
21177  
21178 -               if (timer->flags & TIMER_IRQSAFE) {
21179 -                       spin_unlock(&base->lock);
21180 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
21181 +                   timer->flags & TIMER_IRQSAFE) {
21182 +                       raw_spin_unlock(&base->lock);
21183                         call_timer_fn(timer, fn, data);
21184 -                       spin_lock(&base->lock);
21185 +                       base->running_timer = NULL;
21186 +                       raw_spin_lock(&base->lock);
21187                 } else {
21188 -                       spin_unlock_irq(&base->lock);
21189 +                       raw_spin_unlock_irq(&base->lock);
21190                         call_timer_fn(timer, fn, data);
21191 -                       spin_lock_irq(&base->lock);
21192 +                       base->running_timer = NULL;
21193 +                       raw_spin_lock_irq(&base->lock);
21194                 }
21195         }
21196  }
21197  
21198 -static int __collect_expired_timers(struct timer_base *base,
21199 -                                   struct hlist_head *heads)
21200 +static void expire_timers(struct timer_base *base)
21201 +{
21202 +       struct hlist_head *head;
21203 +
21204 +       while (base->expired_count--) {
21205 +               head = base->expired_lists + base->expired_count;
21206 +               __expire_timers(base, head);
21207 +       }
21208 +       base->expired_count = 0;
21209 +}
21210 +
21211 +static void __collect_expired_timers(struct timer_base *base)
21212  {
21213         unsigned long clk = base->clk;
21214         struct hlist_head *vec;
21215 -       int i, levels = 0;
21216 +       int i;
21217         unsigned int idx;
21218  
21219 +       /*
21220 +        * expire_timers() must be called at least once before we can
21221 +        * collect more timers
21222 +        */
21223 +       if (WARN_ON(base->expired_count))
21224 +               return;
21225 +
21226         for (i = 0; i < LVL_DEPTH; i++) {
21227                 idx = (clk & LVL_MASK) + i * LVL_SIZE;
21228  
21229                 if (__test_and_clear_bit(idx, base->pending_map)) {
21230                         vec = base->vectors + idx;
21231 -                       hlist_move_list(vec, heads++);
21232 -                       levels++;
21233 +                       hlist_move_list(vec,
21234 +                               &base->expired_lists[base->expired_count++]);
21235                 }
21236                 /* Is it time to look at the next level? */
21237                 if (clk & LVL_CLK_MASK)
21238 @@ -1373,7 +1426,6 @@ static int __collect_expired_timers(struct timer_base *base,
21239                 /* Shift clock for the next level granularity */
21240                 clk >>= LVL_CLK_SHIFT;
21241         }
21242 -       return levels;
21243  }
21244  
21245  #ifdef CONFIG_NO_HZ_COMMON
21246 @@ -1515,7 +1567,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
21247         if (cpu_is_offline(smp_processor_id()))
21248                 return expires;
21249  
21250 -       spin_lock(&base->lock);
21251 +       raw_spin_lock(&base->lock);
21252         nextevt = __next_timer_interrupt(base);
21253         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
21254         base->next_expiry = nextevt;
21255 @@ -1543,7 +1595,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
21256                 if ((expires - basem) > TICK_NSEC)
21257                         base->is_idle = true;
21258         }
21259 -       spin_unlock(&base->lock);
21260 +       raw_spin_unlock(&base->lock);
21261  
21262         return cmp_next_hrtimer_event(basem, expires);
21263  }
21264 @@ -1566,8 +1618,7 @@ void timer_clear_idle(void)
21265         base->is_idle = false;
21266  }
21267  
21268 -static int collect_expired_timers(struct timer_base *base,
21269 -                                 struct hlist_head *heads)
21270 +static void collect_expired_timers(struct timer_base *base)
21271  {
21272         /*
21273          * NOHZ optimization. After a long idle sleep we need to forward the
21274 @@ -1584,20 +1635,49 @@ static int collect_expired_timers(struct timer_base *base,
21275                 if (time_after(next, jiffies)) {
21276                         /* The call site will increment clock! */
21277                         base->clk = jiffies - 1;
21278 -                       return 0;
21279 +                       return;
21280                 }
21281                 base->clk = next;
21282         }
21283 -       return __collect_expired_timers(base, heads);
21284 +       __collect_expired_timers(base);
21285  }
21286  #else
21287 -static inline int collect_expired_timers(struct timer_base *base,
21288 -                                        struct hlist_head *heads)
21289 +static inline void collect_expired_timers(struct timer_base *base)
21290  {
21291 -       return __collect_expired_timers(base, heads);
21292 +       __collect_expired_timers(base);
21293  }
21294  #endif
21295  
21296 +static int find_expired_timers(struct timer_base *base)
21297 +{
21298 +       const unsigned long int end_clk = jiffies;
21299 +
21300 +       while (!base->expired_count && time_after_eq(end_clk, base->clk)) {
21301 +               collect_expired_timers(base);
21302 +               base->clk++;
21303 +       }
21304 +
21305 +       return base->expired_count;
21306 +}
21307 +
21308 +/* Called from CPU tick routine to quickly collect expired timers */
21309 +static int tick_find_expired(struct timer_base *base)
21310 +{
21311 +       int count;
21312 +
21313 +       raw_spin_lock(&base->lock);
21314 +
21315 +       if (unlikely(time_after(jiffies, base->clk + HZ))) {
21316 +               /* defer to ktimersoftd; don't spend too long in irq context */
21317 +               count = -1;
21318 +       } else
21319 +               count = find_expired_timers(base);
21320 +
21321 +       raw_spin_unlock(&base->lock);
21322 +
21323 +       return count;
21324 +}
21325 +
21326  /*
21327   * Called from the timer interrupt handler to charge one tick to the current
21328   * process.  user_tick is 1 if the tick is user time, 0 for system.
21329 @@ -1608,13 +1688,13 @@ void update_process_times(int user_tick)
21330  
21331         /* Note: this timer irq context must be accounted for as well. */
21332         account_process_tick(p, user_tick);
21333 +       scheduler_tick();
21334         run_local_timers();
21335         rcu_check_callbacks(user_tick);
21336 -#ifdef CONFIG_IRQ_WORK
21337 +#if defined(CONFIG_IRQ_WORK)
21338         if (in_irq())
21339                 irq_work_tick();
21340  #endif
21341 -       scheduler_tick();
21342         run_posix_cpu_timers(p);
21343  }
21344  
21345 @@ -1624,24 +1704,13 @@ void update_process_times(int user_tick)
21346   */
21347  static inline void __run_timers(struct timer_base *base)
21348  {
21349 -       struct hlist_head heads[LVL_DEPTH];
21350 -       int levels;
21351 +       raw_spin_lock_irq(&base->lock);
21352  
21353 -       if (!time_after_eq(jiffies, base->clk))
21354 -               return;
21355 +       while (find_expired_timers(base))
21356 +               expire_timers(base);
21357  
21358 -       spin_lock_irq(&base->lock);
21359 -
21360 -       while (time_after_eq(jiffies, base->clk)) {
21361 -
21362 -               levels = collect_expired_timers(base, heads);
21363 -               base->clk++;
21364 -
21365 -               while (levels--)
21366 -                       expire_timers(base, heads + levels);
21367 -       }
21368 -       base->running_timer = NULL;
21369 -       spin_unlock_irq(&base->lock);
21370 +       raw_spin_unlock_irq(&base->lock);
21371 +       wakeup_timer_waiters(base);
21372  }
21373  
21374  /*
21375 @@ -1651,6 +1720,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
21376  {
21377         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
21378  
21379 +       irq_work_tick_soft();
21380 +
21381         __run_timers(base);
21382         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
21383                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
21384 @@ -1665,12 +1736,12 @@ void run_local_timers(void)
21385  
21386         hrtimer_run_queues();
21387         /* Raise the softirq only if required. */
21388 -       if (time_before(jiffies, base->clk)) {
21389 +       if (time_before(jiffies, base->clk) || !tick_find_expired(base)) {
21390                 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
21391                         return;
21392                 /* CPU is awake, so check the deferrable base. */
21393                 base++;
21394 -               if (time_before(jiffies, base->clk))
21395 +               if (time_before(jiffies, base->clk) || !tick_find_expired(base))
21396                         return;
21397         }
21398         raise_softirq(TIMER_SOFTIRQ);
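
run_local_timers() now raises TIMER_SOFTIRQ only when there really is something for it to do. tick_find_expired(), added above, returns the number of expired buckets it collected into base->expired_lists[], 0 when nothing is due yet, or -1 when the base has fallen more than one second (HZ jiffies) behind and the catch-up scan itself should be left to the softirq (ktimersoftd on RT). Per base, the condition in the hunk is equivalent to the following sketch (hypothetical helper, written out only for readability); the deferrable NO_HZ base is checked the same way before falling back to raise_softirq():

        static bool base_wants_softirq(struct timer_base *base) /* hypothetical */
        {
                if (time_before(jiffies, base->clk))
                        return false;           /* nothing can be due yet */

                /* > 0: expired timers were collected; -1: too far behind, let
                 * the softirq catch up.  Only 0 suppresses the softirq. */
                return tick_find_expired(base) != 0;
        }
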
21399 @@ -1836,16 +1907,17 @@ int timers_dead_cpu(unsigned int cpu)
21400                  * The caller is globally serialized and nobody else
21401                  * takes two locks at once, deadlock is not possible.
21402                  */
21403 -               spin_lock_irq(&new_base->lock);
21404 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
21405 +               raw_spin_lock_irq(&new_base->lock);
21406 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
21407  
21408                 BUG_ON(old_base->running_timer);
21409 +               BUG_ON(old_base->expired_count);
21410  
21411                 for (i = 0; i < WHEEL_SIZE; i++)
21412                         migrate_timer_list(new_base, old_base->vectors + i);
21413  
21414 -               spin_unlock(&old_base->lock);
21415 -               spin_unlock_irq(&new_base->lock);
21416 +               raw_spin_unlock(&old_base->lock);
21417 +               raw_spin_unlock_irq(&new_base->lock);
21418                 put_cpu_ptr(&timer_bases);
21419         }
21420         return 0;
21421 @@ -1861,8 +1933,12 @@ static void __init init_timer_cpu(int cpu)
21422         for (i = 0; i < NR_BASES; i++) {
21423                 base = per_cpu_ptr(&timer_bases[i], cpu);
21424                 base->cpu = cpu;
21425 -               spin_lock_init(&base->lock);
21426 +               raw_spin_lock_init(&base->lock);
21427                 base->clk = jiffies;
21428 +#ifdef CONFIG_PREEMPT_RT_FULL
21429 +               init_swait_queue_head(&base->wait_for_running_timer);
21430 +#endif
21431 +               base->expired_count = 0;
21432         }
21433  }
21434  
21435 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
21436 index 2a96b063d659..812e37237eb8 100644
21437 --- a/kernel/trace/Kconfig
21438 +++ b/kernel/trace/Kconfig
21439 @@ -182,6 +182,24 @@ config IRQSOFF_TRACER
21440           enabled. This option and the preempt-off timing option can be
21441           used together or separately.)
21442  
21443 +config INTERRUPT_OFF_HIST
21444 +       bool "Interrupts-off Latency Histogram"
21445 +       depends on IRQSOFF_TRACER
21446 +       help
21447 +         This option generates continuously updated histograms (one per cpu)
21448 +         of the duration of time periods with interrupts disabled. The
21449 +         histograms are disabled by default. To enable them, write a non-zero
21450 +         number to
21451 +
21452 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
21453 +
21454 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
21455 +         per cpu) are generated that accumulate the duration of time periods
21456 +         when both interrupts and preemption are disabled. The histogram data
21457 +         will be located in the debug file system at
21458 +
21459 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
21460 +
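
The histograms are disabled by default and are switched on at run time through the debugfs files named above. A small user-space sketch (assuming debugfs is mounted at /sys/kernel/debug) that enables the critical-section histograms and lists the per-CPU files in the irqsoff directory:

        #include <stdio.h>
        #include <dirent.h>

        int main(void)
        {
                const char *enable =
                    "/sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff";
                const char *histdir =
                    "/sys/kernel/debug/tracing/latency_hist/irqsoff";
                FILE *f = fopen(enable, "w");
                struct dirent *e;
                DIR *d;

                if (!f || fputs("1\n", f) == EOF) {
                        perror(enable);
                        return 1;
                }
                fclose(f);

                d = opendir(histdir);           /* one histogram file per CPU */
                if (!d) {
                        perror(histdir);
                        return 1;
                }
                while ((e = readdir(d)))
                        printf("%s/%s\n", histdir, e->d_name);
                closedir(d);
                return 0;
        }
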
21461  config PREEMPT_TRACER
21462         bool "Preemption-off Latency Tracer"
21463         default n
21464 @@ -206,6 +224,24 @@ config PREEMPT_TRACER
21465           enabled. This option and the irqs-off timing option can be
21466           used together or separately.)
21467  
21468 +config PREEMPT_OFF_HIST
21469 +       bool "Preemption-off Latency Histogram"
21470 +       depends on PREEMPT_TRACER
21471 +       help
21472 +         This option generates continuously updated histograms (one per cpu)
21473 +         of the duration of time periods with preemption disabled. The
21474 +         histograms are disabled by default. To enable them, write a non-zero
21475 +         number to
21476 +
21477 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
21478 +
21479 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
21480 +         per cpu) are generated that accumulate the duration of time periods
21481 +         when both interrupts and preemption are disabled. The histogram data
21482 +         will be located in the debug file system at
21483 +
21484 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
21485 +
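
Both critical-section histograms share the preemptirqsoff enable file named above. The per-CPU files produced by latency_hist.c (added further below) start with '#'-prefixed summary lines (minimum, average and maximum latency, total samples, and the out-of-range counters) followed by one "usecs<TAB>samples" row per one-microsecond bucket. A sketch of a reader that skips the header and reports the most populated bucket (pass one histogram file as the argument):

        #include <stdio.h>

        int main(int argc, char **argv)
        {
                char line[256];
                long usecs, best_usecs = 0;
                unsigned long long samples, best = 0;
                FILE *f;

                if (argc < 2 || !(f = fopen(argv[1], "r")))
                        return 1;

                while (fgets(line, sizeof(line), f)) {
                        if (line[0] == '#')
                                continue;       /* summary header */
                        if (sscanf(line, "%ld %llu", &usecs, &samples) == 2 &&
                            samples > best) {
                                best = samples;
                                best_usecs = usecs;
                        }
                }
                fclose(f);
                printf("most frequent latency: %ld usecs (%llu samples)\n",
                       best_usecs, best);
                return 0;
        }
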
21486  config SCHED_TRACER
21487         bool "Scheduling Latency Tracer"
21488         select GENERIC_TRACER
21489 @@ -251,6 +287,74 @@ config HWLAT_TRACER
21490          file. Every time a latency is greater than tracing_thresh, it will
21491          be recorded into the ring buffer.
21492  
21493 +config WAKEUP_LATENCY_HIST
21494 +       bool "Scheduling Latency Histogram"
21495 +       depends on SCHED_TRACER
21496 +       help
21497 +         This option generates continuously updated histograms (one per cpu)
21498 +         of the scheduling latency of the highest priority task.
21499 +         The histograms are disabled by default. To enable them, write a
21500 +         non-zero number to
21501 +
21502 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
21503 +
21504 +         Two different algorithms are used, one to determine the latency of
21505 +         processes that exclusively use the highest priority of the system and
21506 +         another one to determine the latency of processes that share the
21507 +         highest system priority with other processes. The former is used to
21508 +         improve hardware and system software, the latter to optimize the
21509 +         priority design of a given system. The histogram data will be
21510 +         located in the debug file system at
21511 +
21512 +             /sys/kernel/debug/tracing/latency_hist/wakeup
21513 +
21514 +         and
21515 +
21516 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
21517 +
21518 +         If both Scheduling Latency Histogram and Missed Timer Offsets
21519 +         Histogram are selected, additional histogram data will be collected
21520 +         that contain, in addition to the wakeup latency, the timer latency, in
21521 +         case the wakeup was triggered by an expired timer. These histograms
21522 +         are available in the
21523 +
21524 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
21525 +
21526 +         directory. They reflect the apparent interrupt and scheduling latency
21527 +         and are best suitable to determine the worst-case latency of a given
21528 +         system. To enable these histograms, write a non-zero number to
21529 +
21530 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
21531 +
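
The help text above distinguishes tasks that own the highest system priority exclusively (the wakeup directory) from tasks that share it (sharedprio). To land in the exclusive histogram, the task under test must therefore be the only one running at the top real-time priority; a user-space sketch of such a setup with the standard POSIX scheduling API (illustrative only):

        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                struct sched_param sp = {
                        /* top FIFO priority: tracked by .../wakeup rather than
                         * .../wakeup/sharedprio while no other task uses it */
                        .sched_priority = sched_get_priority_max(SCHED_FIFO),
                };

                if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
                        perror("sched_setscheduler");
                        return 1;
                }
                /* ... the periodic real-time workload to be measured ... */
                return 0;
        }
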
21532 +config MISSED_TIMER_OFFSETS_HIST
21533 +       depends on HIGH_RES_TIMERS
21534 +       select GENERIC_TRACER
21535 +       bool "Missed Timer Offsets Histogram"
21536 +       help
21537 +         Generate a histogram of missed timer offsets in microseconds. The
21538 +         histograms are disabled by default. To enable them, write a non-zero
21539 +         number to
21540 +
21541 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
21542 +
21543 +         The histogram data will be located in the debug file system at
21544 +
21545 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
21546 +
21547 +         If both Scheduling Latency Histogram and Missed Timer Offsets
21548 +         Histogram are selected, additional histogram data will be collected
21549 +         that contain, in addition to the wakeup latency, the timer latency, in
21550 +         case the wakeup was triggered by an expired timer. These histograms
21551 +         are available in the
21552 +
21553 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
21554 +
21555 +         directory. They reflect the apparent interrupt and scheduling latency
21556 +         and are best suitable to determine the worst-case latency of a given
21557 +         system. To enable these histograms, write a non-zero number to
21558 +
21559 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
21560 +
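
Missed timer offsets are sampled from the hrtimer interrupt (see the probe_hrtimer_interrupt registration in latency_hist.c below), so any task that sleeps until an absolute deadline feeds this histogram; together with the wakeup histograms, the timerandwakeup directory then gives the combined timer-plus-scheduling latency described above. A minimal cyclictest-style loop whose wakeups would be measured (user-space sketch):

        #include <time.h>

        #define PERIOD_NS 1000000L      /* 1 ms period, illustrative */

        int main(void)
        {
                struct timespec next;
                int i;

                clock_gettime(CLOCK_MONOTONIC, &next);
                for (i = 0; i < 10000; i++) {
                        next.tv_nsec += PERIOD_NS;
                        if (next.tv_nsec >= 1000000000L) {
                                next.tv_nsec -= 1000000000L;
                                next.tv_sec++;
                        }
                        /* Absolute-deadline sleep: how late the underlying
                         * hrtimer fires is what the histogram accumulates. */
                        clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                                        &next, NULL);
                }
                return 0;
        }
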
21561  config ENABLE_DEFAULT_TRACERS
21562         bool "Trace process context switches and events"
21563         depends on !GENERIC_TRACER
21564 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
21565 index e57980845549..83af000b783c 100644
21566 --- a/kernel/trace/Makefile
21567 +++ b/kernel/trace/Makefile
21568 @@ -38,6 +38,10 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
21569  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
21570  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
21571  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
21572 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
21573 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
21574 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
21575 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
21576  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
21577  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
21578  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
21579 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
21580 new file mode 100644
21581 index 000000000000..7f6ee70dea41
21582 --- /dev/null
21583 +++ b/kernel/trace/latency_hist.c
21584 @@ -0,0 +1,1178 @@
21585 +/*
21586 + * kernel/trace/latency_hist.c
21587 + *
21588 + * Add support for histograms of preemption-off latency,
21589 + * interrupt-off latency, and wakeup latency; it depends on
21590 + * Real-Time Preemption Support.
21591 + *
21592 + *  Copyright (C) 2005 MontaVista Software, Inc.
21593 + *  Yi Yang <yyang@ch.mvista.com>
21594 + *
21595 + *  Converted to work with the new latency tracer.
21596 + *  Copyright (C) 2008 Red Hat, Inc.
21597 + *    Steven Rostedt <srostedt@redhat.com>
21598 + *
21599 + */
21600 +#include <linux/module.h>
21601 +#include <linux/debugfs.h>
21602 +#include <linux/seq_file.h>
21603 +#include <linux/percpu.h>
21604 +#include <linux/kallsyms.h>
21605 +#include <linux/uaccess.h>
21606 +#include <linux/sched.h>
21607 +#include <linux/sched/rt.h>
21608 +#include <linux/slab.h>
21609 +#include <linux/atomic.h>
21610 +#include <asm/div64.h>
21611 +
21612 +#include "trace.h"
21613 +#include <trace/events/sched.h>
21614 +
21615 +#define NSECS_PER_USECS 1000L
21616 +
21617 +#define CREATE_TRACE_POINTS
21618 +#include <trace/events/hist.h>
21619 +
21620 +enum {
21621 +       IRQSOFF_LATENCY = 0,
21622 +       PREEMPTOFF_LATENCY,
21623 +       PREEMPTIRQSOFF_LATENCY,
21624 +       WAKEUP_LATENCY,
21625 +       WAKEUP_LATENCY_SHAREDPRIO,
21626 +       MISSED_TIMER_OFFSETS,
21627 +       TIMERANDWAKEUP_LATENCY,
21628 +       MAX_LATENCY_TYPE,
21629 +};
21630 +
21631 +#define MAX_ENTRY_NUM 10240
21632 +
21633 +struct hist_data {
21634 +       atomic_t hist_mode; /* 0 log, 1 don't log */
21635 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
21636 +       long min_lat;
21637 +       long max_lat;
21638 +       unsigned long long below_hist_bound_samples;
21639 +       unsigned long long above_hist_bound_samples;
21640 +       long long accumulate_lat;
21641 +       unsigned long long total_samples;
21642 +       unsigned long long hist_array[MAX_ENTRY_NUM];
21643 +};
21644 +
21645 +struct enable_data {
21646 +       int latency_type;
21647 +       int enabled;
21648 +};
21649 +
21650 +static char *latency_hist_dir_root = "latency_hist";
21651 +
21652 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21653 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
21654 +static char *irqsoff_hist_dir = "irqsoff";
21655 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
21656 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
21657 +#endif
21658 +
21659 +#ifdef CONFIG_PREEMPT_OFF_HIST
21660 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
21661 +static char *preemptoff_hist_dir = "preemptoff";
21662 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
21663 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
21664 +#endif
21665 +
21666 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
21667 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
21668 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
21669 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
21670 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
21671 +#endif
21672 +
21673 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
21674 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
21675 +static struct enable_data preemptirqsoff_enabled_data = {
21676 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
21677 +       .enabled = 0,
21678 +};
21679 +#endif
21680 +
21681 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21682 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21683 +struct maxlatproc_data {
21684 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
21685 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
21686 +       int pid;
21687 +       int current_pid;
21688 +       int prio;
21689 +       int current_prio;
21690 +       long latency;
21691 +       long timeroffset;
21692 +       cycle_t timestamp;
21693 +};
21694 +#endif
21695 +
21696 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21697 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
21698 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
21699 +static char *wakeup_latency_hist_dir = "wakeup";
21700 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
21701 +static notrace void probe_wakeup_latency_hist_start(void *v,
21702 +       struct task_struct *p);
21703 +static notrace void probe_wakeup_latency_hist_stop(void *v,
21704 +       bool preempt, struct task_struct *prev, struct task_struct *next);
21705 +static notrace void probe_sched_migrate_task(void *,
21706 +       struct task_struct *task, int cpu);
21707 +static struct enable_data wakeup_latency_enabled_data = {
21708 +       .latency_type = WAKEUP_LATENCY,
21709 +       .enabled = 0,
21710 +};
21711 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
21712 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
21713 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
21714 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
21715 +static unsigned long wakeup_pid;
21716 +#endif
21717 +
21718 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21719 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
21720 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
21721 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
21722 +       long long offset, struct task_struct *curr, struct task_struct *task);
21723 +static struct enable_data missed_timer_offsets_enabled_data = {
21724 +       .latency_type = MISSED_TIMER_OFFSETS,
21725 +       .enabled = 0,
21726 +};
21727 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
21728 +static unsigned long missed_timer_offsets_pid;
21729 +#endif
21730 +
21731 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21732 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21733 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
21734 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
21735 +static struct enable_data timerandwakeup_enabled_data = {
21736 +       .latency_type = TIMERANDWAKEUP_LATENCY,
21737 +       .enabled = 0,
21738 +};
21739 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
21740 +#endif
21741 +
21742 +void notrace latency_hist(int latency_type, int cpu, long latency,
21743 +                         long timeroffset, cycle_t stop,
21744 +                         struct task_struct *p)
21745 +{
21746 +       struct hist_data *my_hist;
21747 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21748 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21749 +       struct maxlatproc_data *mp = NULL;
21750 +#endif
21751 +
21752 +       if (!cpu_possible(cpu) || latency_type < 0 ||
21753 +           latency_type >= MAX_LATENCY_TYPE)
21754 +               return;
21755 +
21756 +       switch (latency_type) {
21757 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21758 +       case IRQSOFF_LATENCY:
21759 +               my_hist = &per_cpu(irqsoff_hist, cpu);
21760 +               break;
21761 +#endif
21762 +#ifdef CONFIG_PREEMPT_OFF_HIST
21763 +       case PREEMPTOFF_LATENCY:
21764 +               my_hist = &per_cpu(preemptoff_hist, cpu);
21765 +               break;
21766 +#endif
21767 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
21768 +       case PREEMPTIRQSOFF_LATENCY:
21769 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
21770 +               break;
21771 +#endif
21772 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21773 +       case WAKEUP_LATENCY:
21774 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
21775 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
21776 +               break;
21777 +       case WAKEUP_LATENCY_SHAREDPRIO:
21778 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
21779 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
21780 +               break;
21781 +#endif
21782 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21783 +       case MISSED_TIMER_OFFSETS:
21784 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
21785 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
21786 +               break;
21787 +#endif
21788 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21789 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21790 +       case TIMERANDWAKEUP_LATENCY:
21791 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
21792 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
21793 +               break;
21794 +#endif
21795 +
21796 +       default:
21797 +               return;
21798 +       }
21799 +
21800 +       latency += my_hist->offset;
21801 +
21802 +       if (atomic_read(&my_hist->hist_mode) == 0)
21803 +               return;
21804 +
21805 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
21806 +               if (latency < 0)
21807 +                       my_hist->below_hist_bound_samples++;
21808 +               else
21809 +                       my_hist->above_hist_bound_samples++;
21810 +       } else
21811 +               my_hist->hist_array[latency]++;
21812 +
21813 +       if (unlikely(latency > my_hist->max_lat ||
21814 +           my_hist->min_lat == LONG_MAX)) {
21815 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21816 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21817 +               if (latency_type == WAKEUP_LATENCY ||
21818 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
21819 +                   latency_type == MISSED_TIMER_OFFSETS ||
21820 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
21821 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
21822 +                       strncpy(mp->current_comm, current->comm,
21823 +                           sizeof(mp->current_comm));
21824 +                       mp->pid = task_pid_nr(p);
21825 +                       mp->current_pid = task_pid_nr(current);
21826 +                       mp->prio = p->prio;
21827 +                       mp->current_prio = current->prio;
21828 +                       mp->latency = latency;
21829 +                       mp->timeroffset = timeroffset;
21830 +                       mp->timestamp = stop;
21831 +               }
21832 +#endif
21833 +               my_hist->max_lat = latency;
21834 +       }
21835 +       if (unlikely(latency < my_hist->min_lat))
21836 +               my_hist->min_lat = latency;
21837 +       my_hist->total_samples++;
21838 +       my_hist->accumulate_lat += latency;
21839 +}
21840 +
21841 +static void *l_start(struct seq_file *m, loff_t *pos)
21842 +{
21843 +       loff_t *index_ptr = NULL;
21844 +       loff_t index = *pos;
21845 +       struct hist_data *my_hist = m->private;
21846 +
21847 +       if (index == 0) {
21848 +               char minstr[32], avgstr[32], maxstr[32];
21849 +
21850 +               atomic_dec(&my_hist->hist_mode);
21851 +
21852 +               if (likely(my_hist->total_samples)) {
21853 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
21854 +                           my_hist->total_samples);
21855 +                       snprintf(minstr, sizeof(minstr), "%ld",
21856 +                           my_hist->min_lat - my_hist->offset);
21857 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
21858 +                           avg - my_hist->offset);
21859 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
21860 +                           my_hist->max_lat - my_hist->offset);
21861 +               } else {
21862 +                       strcpy(minstr, "<undef>");
21863 +                       strcpy(avgstr, minstr);
21864 +                       strcpy(maxstr, minstr);
21865 +               }
21866 +
21867 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
21868 +                          "#Average latency: %s microseconds\n"
21869 +                          "#Maximum latency: %s microseconds\n"
21870 +                          "#Total samples: %llu\n"
21871 +                          "#There are %llu samples lower than %ld"
21872 +                          " microseconds.\n"
21873 +                          "#There are %llu samples greater than or"
21874 +                          " equal to %ld microseconds.\n"
21875 +                          "#usecs\t%16s\n",
21876 +                          minstr, avgstr, maxstr,
21877 +                          my_hist->total_samples,
21878 +                          my_hist->below_hist_bound_samples,
21879 +                          -my_hist->offset,
21880 +                          my_hist->above_hist_bound_samples,
21881 +                          MAX_ENTRY_NUM - my_hist->offset,
21882 +                          "samples");
21883 +       }
21884 +       if (index < MAX_ENTRY_NUM) {
21885 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
21886 +               if (index_ptr)
21887 +                       *index_ptr = index;
21888 +       }
21889 +
21890 +       return index_ptr;
21891 +}
21892 +
21893 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
21894 +{
21895 +       loff_t *index_ptr = p;
21896 +       struct hist_data *my_hist = m->private;
21897 +
21898 +       if (++*pos >= MAX_ENTRY_NUM) {
21899 +               atomic_inc(&my_hist->hist_mode);
21900 +               return NULL;
21901 +       }
21902 +       *index_ptr = *pos;
21903 +       return index_ptr;
21904 +}
21905 +
21906 +static void l_stop(struct seq_file *m, void *p)
21907 +{
21908 +       kfree(p);
21909 +}
21910 +
21911 +static int l_show(struct seq_file *m, void *p)
21912 +{
21913 +       int index = *(loff_t *) p;
21914 +       struct hist_data *my_hist = m->private;
21915 +
21916 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
21917 +           my_hist->hist_array[index]);
21918 +       return 0;
21919 +}
21920 +
21921 +static const struct seq_operations latency_hist_seq_op = {
21922 +       .start = l_start,
21923 +       .next  = l_next,
21924 +       .stop  = l_stop,
21925 +       .show  = l_show
21926 +};
21927 +
21928 +static int latency_hist_open(struct inode *inode, struct file *file)
21929 +{
21930 +       int ret;
21931 +
21932 +       ret = seq_open(file, &latency_hist_seq_op);
21933 +       if (!ret) {
21934 +               struct seq_file *seq = file->private_data;
21935 +               seq->private = inode->i_private;
21936 +       }
21937 +       return ret;
21938 +}
21939 +
21940 +static const struct file_operations latency_hist_fops = {
21941 +       .open = latency_hist_open,
21942 +       .read = seq_read,
21943 +       .llseek = seq_lseek,
21944 +       .release = seq_release,
21945 +};
21946 +
21947 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21948 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21949 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
21950 +{
21951 +       mp->comm[0] = mp->current_comm[0] = '\0';
21952 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
21953 +           mp->latency = mp->timeroffset = -1;
21954 +       mp->timestamp = 0;
21955 +}
21956 +#endif
21957 +
21958 +static void hist_reset(struct hist_data *hist)
21959 +{
21960 +       atomic_dec(&hist->hist_mode);
21961 +
21962 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
21963 +       hist->below_hist_bound_samples = 0ULL;
21964 +       hist->above_hist_bound_samples = 0ULL;
21965 +       hist->min_lat = LONG_MAX;
21966 +       hist->max_lat = LONG_MIN;
21967 +       hist->total_samples = 0ULL;
21968 +       hist->accumulate_lat = 0LL;
21969 +
21970 +       atomic_inc(&hist->hist_mode);
21971 +}
21972 +
21973 +static ssize_t
21974 +latency_hist_reset(struct file *file, const char __user *a,
21975 +                  size_t size, loff_t *off)
21976 +{
21977 +       int cpu;
21978 +       struct hist_data *hist = NULL;
21979 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21980 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21981 +       struct maxlatproc_data *mp = NULL;
21982 +#endif
21983 +       off_t latency_type = (off_t) file->private_data;
21984 +
21985 +       for_each_online_cpu(cpu) {
21986 +
21987 +               switch (latency_type) {
21988 +#ifdef CONFIG_PREEMPT_OFF_HIST
21989 +               case PREEMPTOFF_LATENCY:
21990 +                       hist = &per_cpu(preemptoff_hist, cpu);
21991 +                       break;
21992 +#endif
21993 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21994 +               case IRQSOFF_LATENCY:
21995 +                       hist = &per_cpu(irqsoff_hist, cpu);
21996 +                       break;
21997 +#endif
21998 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21999 +               case PREEMPTIRQSOFF_LATENCY:
22000 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
22001 +                       break;
22002 +#endif
22003 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22004 +               case WAKEUP_LATENCY:
22005 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
22006 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
22007 +                       break;
22008 +               case WAKEUP_LATENCY_SHAREDPRIO:
22009 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
22010 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
22011 +                       break;
22012 +#endif
22013 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22014 +               case MISSED_TIMER_OFFSETS:
22015 +                       hist = &per_cpu(missed_timer_offsets, cpu);
22016 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
22017 +                       break;
22018 +#endif
22019 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22020 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22021 +               case TIMERANDWAKEUP_LATENCY:
22022 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
22023 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
22024 +                       break;
22025 +#endif
22026 +               }
22027 +
22028 +               hist_reset(hist);
22029 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22030 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22031 +               if (latency_type == WAKEUP_LATENCY ||
22032 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
22033 +                   latency_type == MISSED_TIMER_OFFSETS ||
22034 +                   latency_type == TIMERANDWAKEUP_LATENCY)
22035 +                       clear_maxlatprocdata(mp);
22036 +#endif
22037 +       }
22038 +
22039 +       return size;
22040 +}
22041 +
22042 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22043 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22044 +static ssize_t
22045 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22046 +{
22047 +       char buf[64];
22048 +       int r;
22049 +       unsigned long *this_pid = file->private_data;
22050 +
22051 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
22052 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22053 +}
22054 +
22055 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
22056 +                     size_t cnt, loff_t *ppos)
22057 +{
22058 +       char buf[64];
22059 +       unsigned long pid;
22060 +       unsigned long *this_pid = file->private_data;
22061 +
22062 +       if (cnt >= sizeof(buf))
22063 +               return -EINVAL;
22064 +
22065 +       if (copy_from_user(&buf, ubuf, cnt))
22066 +               return -EFAULT;
22067 +
22068 +       buf[cnt] = '\0';
22069 +
22070 +       if (kstrtoul(buf, 10, &pid))
22071 +               return -EINVAL;
22072 +
22073 +       *this_pid = pid;
22074 +
22075 +       return cnt;
22076 +}
22077 +#endif
22078 +
22079 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22080 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22081 +static ssize_t
22082 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22083 +{
22084 +       int r;
22085 +       struct maxlatproc_data *mp = file->private_data;
22086 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
22087 +       unsigned long long t;
22088 +       unsigned long usecs, secs;
22089 +       char *buf;
22090 +
22091 +       if (mp->pid == -1 || mp->current_pid == -1) {
22092 +               buf = "(none)\n";
22093 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
22094 +                   strlen(buf));
22095 +       }
22096 +
22097 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
22098 +       if (buf == NULL)
22099 +               return -ENOMEM;
22100 +
22101 +       t = ns2usecs(mp->timestamp);
22102 +       usecs = do_div(t, USEC_PER_SEC);
22103 +       secs = (unsigned long) t;
22104 +       r = snprintf(buf, strmaxlen,
22105 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
22106 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
22107 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
22108 +           secs, usecs);
22109 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22110 +       kfree(buf);
22111 +       return r;
22112 +}
22113 +#endif
22114 +
22115 +static ssize_t
22116 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
22117 +{
22118 +       char buf[64];
22119 +       struct enable_data *ed = file->private_data;
22120 +       int r;
22121 +
22122 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
22123 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
22124 +}
22125 +
22126 +static ssize_t
22127 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
22128 +{
22129 +       char buf[64];
22130 +       long enable;
22131 +       struct enable_data *ed = file->private_data;
22132 +
22133 +       if (cnt >= sizeof(buf))
22134 +               return -EINVAL;
22135 +
22136 +       if (copy_from_user(&buf, ubuf, cnt))
22137 +               return -EFAULT;
22138 +
22139 +       buf[cnt] = 0;
22140 +
22141 +       if (kstrtoul(buf, 10, &enable))
22142 +               return -EINVAL;
22143 +
22144 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
22145 +               return cnt;
22146 +
22147 +       if (enable) {
22148 +               int ret;
22149 +
22150 +               switch (ed->latency_type) {
22151 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22152 +               case PREEMPTIRQSOFF_LATENCY:
22153 +                       ret = register_trace_preemptirqsoff_hist(
22154 +                           probe_preemptirqsoff_hist, NULL);
22155 +                       if (ret) {
22156 +                               pr_info("wakeup trace: Couldn't assign "
22157 +                                   "probe_preemptirqsoff_hist "
22158 +                                   "to trace_preemptirqsoff_hist\n");
22159 +                               return ret;
22160 +                       }
22161 +                       break;
22162 +#endif
22163 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22164 +               case WAKEUP_LATENCY:
22165 +                       ret = register_trace_sched_wakeup(
22166 +                           probe_wakeup_latency_hist_start, NULL);
22167 +                       if (ret) {
22168 +                               pr_info("wakeup trace: Couldn't assign "
22169 +                                   "probe_wakeup_latency_hist_start "
22170 +                                   "to trace_sched_wakeup\n");
22171 +                               return ret;
22172 +                       }
22173 +                       ret = register_trace_sched_wakeup_new(
22174 +                           probe_wakeup_latency_hist_start, NULL);
22175 +                       if (ret) {
22176 +                               pr_info("wakeup trace: Couldn't assign "
22177 +                                   "probe_wakeup_latency_hist_start "
22178 +                                   "to trace_sched_wakeup_new\n");
22179 +                               unregister_trace_sched_wakeup(
22180 +                                   probe_wakeup_latency_hist_start, NULL);
22181 +                               return ret;
22182 +                       }
22183 +                       ret = register_trace_sched_switch(
22184 +                           probe_wakeup_latency_hist_stop, NULL);
22185 +                       if (ret) {
22186 +                               pr_info("wakeup trace: Couldn't assign "
22187 +                                   "probe_wakeup_latency_hist_stop "
22188 +                                   "to trace_sched_switch\n");
22189 +                               unregister_trace_sched_wakeup(
22190 +                                   probe_wakeup_latency_hist_start, NULL);
22191 +                               unregister_trace_sched_wakeup_new(
22192 +                                   probe_wakeup_latency_hist_start, NULL);
22193 +                               return ret;
22194 +                       }
22195 +                       ret = register_trace_sched_migrate_task(
22196 +                           probe_sched_migrate_task, NULL);
22197 +                       if (ret) {
22198 +                               pr_info("wakeup trace: Couldn't assign "
22199 +                                   "probe_sched_migrate_task "
22200 +                                   "to trace_sched_migrate_task\n");
22201 +                               unregister_trace_sched_wakeup(
22202 +                                   probe_wakeup_latency_hist_start, NULL);
22203 +                               unregister_trace_sched_wakeup_new(
22204 +                                   probe_wakeup_latency_hist_start, NULL);
22205 +                               unregister_trace_sched_switch(
22206 +                                   probe_wakeup_latency_hist_stop, NULL);
22207 +                               return ret;
22208 +                       }
22209 +                       break;
22210 +#endif
22211 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22212 +               case MISSED_TIMER_OFFSETS:
22213 +                       ret = register_trace_hrtimer_interrupt(
22214 +                           probe_hrtimer_interrupt, NULL);
22215 +                       if (ret) {
22216 +                               pr_info("wakeup trace: Couldn't assign "
22217 +                                   "probe_hrtimer_interrupt "
22218 +                                   "to trace_hrtimer_interrupt\n");
22219 +                               return ret;
22220 +                       }
22221 +                       break;
22222 +#endif
22223 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22224 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22225 +               case TIMERANDWAKEUP_LATENCY:
22226 +                       if (!wakeup_latency_enabled_data.enabled ||
22227 +                           !missed_timer_offsets_enabled_data.enabled)
22228 +                               return -EINVAL;
22229 +                       break;
22230 +#endif
22231 +               default:
22232 +                       break;
22233 +               }
22234 +       } else {
22235 +               switch (ed->latency_type) {
22236 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22237 +               case PREEMPTIRQSOFF_LATENCY:
22238 +                       {
22239 +                               int cpu;
22240 +
22241 +                               unregister_trace_preemptirqsoff_hist(
22242 +                                   probe_preemptirqsoff_hist, NULL);
22243 +                               for_each_online_cpu(cpu) {
22244 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22245 +                                       per_cpu(hist_irqsoff_counting,
22246 +                                           cpu) = 0;
22247 +#endif
22248 +#ifdef CONFIG_PREEMPT_OFF_HIST
22249 +                                       per_cpu(hist_preemptoff_counting,
22250 +                                           cpu) = 0;
22251 +#endif
22252 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22253 +                                       per_cpu(hist_preemptirqsoff_counting,
22254 +                                           cpu) = 0;
22255 +#endif
22256 +                               }
22257 +                       }
22258 +                       break;
22259 +#endif
22260 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22261 +               case WAKEUP_LATENCY:
22262 +                       {
22263 +                               int cpu;
22264 +
22265 +                               unregister_trace_sched_wakeup(
22266 +                                   probe_wakeup_latency_hist_start, NULL);
22267 +                               unregister_trace_sched_wakeup_new(
22268 +                                   probe_wakeup_latency_hist_start, NULL);
22269 +                               unregister_trace_sched_switch(
22270 +                                   probe_wakeup_latency_hist_stop, NULL);
22271 +                               unregister_trace_sched_migrate_task(
22272 +                                   probe_sched_migrate_task, NULL);
22273 +
22274 +                               for_each_online_cpu(cpu) {
22275 +                                       per_cpu(wakeup_task, cpu) = NULL;
22276 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
22277 +                               }
22278 +                       }
22279 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22280 +                       timerandwakeup_enabled_data.enabled = 0;
22281 +#endif
22282 +                       break;
22283 +#endif
22284 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22285 +               case MISSED_TIMER_OFFSETS:
22286 +                       unregister_trace_hrtimer_interrupt(
22287 +                           probe_hrtimer_interrupt, NULL);
22288 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22289 +                       timerandwakeup_enabled_data.enabled = 0;
22290 +#endif
22291 +                       break;
22292 +#endif
22293 +               default:
22294 +                       break;
22295 +               }
22296 +       }
22297 +       ed->enabled = enable;
22298 +       return cnt;
22299 +}
22300 +
22301 +static const struct file_operations latency_hist_reset_fops = {
22302 +       .open = tracing_open_generic,
22303 +       .write = latency_hist_reset,
22304 +};
22305 +
22306 +static const struct file_operations enable_fops = {
22307 +       .open = tracing_open_generic,
22308 +       .read = show_enable,
22309 +       .write = do_enable,
22310 +};
22311 +
22312 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22313 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22314 +static const struct file_operations pid_fops = {
22315 +       .open = tracing_open_generic,
22316 +       .read = show_pid,
22317 +       .write = do_pid,
22318 +};
22319 +
22320 +static const struct file_operations maxlatproc_fops = {
22321 +       .open = tracing_open_generic,
22322 +       .read = show_maxlatproc,
22323 +};
22324 +#endif
22325 +
22326 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22327 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
22328 +       int starthist)
22329 +{
22330 +       int cpu = raw_smp_processor_id();
22331 +       int time_set = 0;
22332 +
22333 +       if (starthist) {
22334 +               cycle_t uninitialized_var(start);
22335 +
22336 +               if (!preempt_count() && !irqs_disabled())
22337 +                       return;
22338 +
22339 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22340 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
22341 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
22342 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
22343 +                       start = ftrace_now(cpu);
22344 +                       time_set++;
22345 +                       per_cpu(hist_irqsoff_start, cpu) = start;
22346 +               }
22347 +#endif
22348 +
22349 +#ifdef CONFIG_PREEMPT_OFF_HIST
22350 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
22351 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
22352 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
22353 +                       if (!(time_set++))
22354 +                               start = ftrace_now(cpu);
22355 +                       per_cpu(hist_preemptoff_start, cpu) = start;
22356 +               }
22357 +#endif
22358 +
22359 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22360 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
22361 +                   per_cpu(hist_preemptoff_counting, cpu) &&
22362 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
22363 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
22364 +                       if (!time_set)
22365 +                               start = ftrace_now(cpu);
22366 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
22367 +               }
22368 +#endif
22369 +       } else {
22370 +               cycle_t uninitialized_var(stop);
22371 +
22372 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22373 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
22374 +                   per_cpu(hist_irqsoff_counting, cpu)) {
22375 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
22376 +
22377 +                       stop = ftrace_now(cpu);
22378 +                       time_set++;
22379 +                       if (start) {
22380 +                               long latency = ((long) (stop - start)) /
22381 +                                   NSECS_PER_USECS;
22382 +
22383 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
22384 +                                   stop, NULL);
22385 +                       }
22386 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
22387 +               }
22388 +#endif
22389 +
22390 +#ifdef CONFIG_PREEMPT_OFF_HIST
22391 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
22392 +                   per_cpu(hist_preemptoff_counting, cpu)) {
22393 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
22394 +
22395 +                       if (!(time_set++))
22396 +                               stop = ftrace_now(cpu);
22397 +                       if (start) {
22398 +                               long latency = ((long) (stop - start)) /
22399 +                                   NSECS_PER_USECS;
22400 +
22401 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
22402 +                                   0, stop, NULL);
22403 +                       }
22404 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
22405 +               }
22406 +#endif
22407 +
22408 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22409 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
22410 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
22411 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
22412 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
22413 +
22414 +                       if (!time_set)
22415 +                               stop = ftrace_now(cpu);
22416 +                       if (start) {
22417 +                               long latency = ((long) (stop - start)) /
22418 +                                   NSECS_PER_USECS;
22419 +
22420 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
22421 +                                   latency, 0, stop, NULL);
22422 +                       }
22423 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
22424 +               }
22425 +#endif
22426 +       }
22427 +}
22428 +#endif
22429 +
22430 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22431 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
22432 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
22433 +       int cpu)
22434 +{
22435 +       int old_cpu = task_cpu(task);
22436 +
22437 +       if (cpu != old_cpu) {
22438 +               unsigned long flags;
22439 +               struct task_struct *cpu_wakeup_task;
22440 +
22441 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
22442 +
22443 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
22444 +               if (task == cpu_wakeup_task) {
22445 +                       put_task_struct(cpu_wakeup_task);
22446 +                       per_cpu(wakeup_task, old_cpu) = NULL;
22447 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
22448 +                       get_task_struct(cpu_wakeup_task);
22449 +               }
22450 +
22451 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
22452 +       }
22453 +}
22454 +
22455 +static notrace void probe_wakeup_latency_hist_start(void *v,
22456 +       struct task_struct *p)
22457 +{
22458 +       unsigned long flags;
22459 +       struct task_struct *curr = current;
22460 +       int cpu = task_cpu(p);
22461 +       struct task_struct *cpu_wakeup_task;
22462 +
22463 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
22464 +
22465 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
22466 +
22467 +       if (wakeup_pid) {
22468 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
22469 +                   p->prio == curr->prio)
22470 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
22471 +               if (likely(wakeup_pid != task_pid_nr(p)))
22472 +                       goto out;
22473 +       } else {
22474 +               if (likely(!rt_task(p)) ||
22475 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
22476 +                   p->prio > curr->prio)
22477 +                       goto out;
22478 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
22479 +                   p->prio == curr->prio)
22480 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
22481 +       }
22482 +
22483 +       if (cpu_wakeup_task)
22484 +               put_task_struct(cpu_wakeup_task);
22485 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
22486 +       get_task_struct(cpu_wakeup_task);
22487 +       cpu_wakeup_task->preempt_timestamp_hist =
22488 +               ftrace_now(raw_smp_processor_id());
22489 +out:
22490 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
22491 +}
22492 +
22493 +static notrace void probe_wakeup_latency_hist_stop(void *v,
22494 +       bool preempt, struct task_struct *prev, struct task_struct *next)
22495 +{
22496 +       unsigned long flags;
22497 +       int cpu = task_cpu(next);
22498 +       long latency;
22499 +       cycle_t stop;
22500 +       struct task_struct *cpu_wakeup_task;
22501 +
22502 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
22503 +
22504 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
22505 +
22506 +       if (cpu_wakeup_task == NULL)
22507 +               goto out;
22508 +
22509 +       /* Already running? */
22510 +       if (unlikely(current == cpu_wakeup_task))
22511 +               goto out_reset;
22512 +
22513 +       if (next != cpu_wakeup_task) {
22514 +               if (next->prio < cpu_wakeup_task->prio)
22515 +                       goto out_reset;
22516 +
22517 +               if (next->prio == cpu_wakeup_task->prio)
22518 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
22519 +
22520 +               goto out;
22521 +       }
22522 +
22523 +       if (current->prio == cpu_wakeup_task->prio)
22524 +               per_cpu(wakeup_sharedprio, cpu) = 1;
22525 +
22526 +       /*
22527 +        * The task we are waiting for is about to be switched to.
22528 +        * Calculate latency and store it in histogram.
22529 +        */
22530 +       stop = ftrace_now(raw_smp_processor_id());
22531 +
22532 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
22533 +           NSECS_PER_USECS;
22534 +
22535 +       if (per_cpu(wakeup_sharedprio, cpu)) {
22536 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
22537 +                   next);
22538 +               per_cpu(wakeup_sharedprio, cpu) = 0;
22539 +       } else {
22540 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
22541 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22542 +               if (timerandwakeup_enabled_data.enabled) {
22543 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
22544 +                           next->timer_offset + latency, next->timer_offset,
22545 +                           stop, next);
22546 +               }
22547 +#endif
22548 +       }
22549 +
22550 +out_reset:
22551 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22552 +       next->timer_offset = 0;
22553 +#endif
22554 +       put_task_struct(cpu_wakeup_task);
22555 +       per_cpu(wakeup_task, cpu) = NULL;
22556 +out:
22557 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
22558 +}
22559 +#endif
22560 +
22561 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22562 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
22563 +       long long latency_ns, struct task_struct *curr,
22564 +       struct task_struct *task)
22565 +{
22566 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
22567 +           (task->prio < curr->prio ||
22568 +           (task->prio == curr->prio &&
22569 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
22570 +               long latency;
22571 +               cycle_t now;
22572 +
22573 +               if (missed_timer_offsets_pid) {
22574 +                       if (likely(missed_timer_offsets_pid !=
22575 +                           task_pid_nr(task)))
22576 +                               return;
22577 +               }
22578 +
22579 +               now = ftrace_now(cpu);
22580 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
22581 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
22582 +                   task);
22583 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22584 +               task->timer_offset = latency;
22585 +#endif
22586 +       }
22587 +}
22588 +#endif
22589 +
22590 +static __init int latency_hist_init(void)
22591 +{
22592 +       struct dentry *latency_hist_root = NULL;
22593 +       struct dentry *dentry;
22594 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22595 +       struct dentry *dentry_sharedprio;
22596 +#endif
22597 +       struct dentry *entry;
22598 +       struct dentry *enable_root;
22599 +       int i = 0;
22600 +       struct hist_data *my_hist;
22601 +       char name[64];
22602 +       char *cpufmt = "CPU%d";
22603 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
22604 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22605 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
22606 +       struct maxlatproc_data *mp = NULL;
22607 +#endif
22608 +
22609 +       dentry = tracing_init_dentry();
22610 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
22611 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
22612 +
22613 +#ifdef CONFIG_INTERRUPT_OFF_HIST
22614 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
22615 +       for_each_possible_cpu(i) {
22616 +               sprintf(name, cpufmt, i);
22617 +               entry = debugfs_create_file(name, 0444, dentry,
22618 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
22619 +               my_hist = &per_cpu(irqsoff_hist, i);
22620 +               atomic_set(&my_hist->hist_mode, 1);
22621 +               my_hist->min_lat = LONG_MAX;
22622 +       }
22623 +       entry = debugfs_create_file("reset", 0644, dentry,
22624 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
22625 +#endif
22626 +
22627 +#ifdef CONFIG_PREEMPT_OFF_HIST
22628 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
22629 +           latency_hist_root);
22630 +       for_each_possible_cpu(i) {
22631 +               sprintf(name, cpufmt, i);
22632 +               entry = debugfs_create_file(name, 0444, dentry,
22633 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
22634 +               my_hist = &per_cpu(preemptoff_hist, i);
22635 +               atomic_set(&my_hist->hist_mode, 1);
22636 +               my_hist->min_lat = LONG_MAX;
22637 +       }
22638 +       entry = debugfs_create_file("reset", 0644, dentry,
22639 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
22640 +#endif
22641 +
22642 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
22643 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
22644 +           latency_hist_root);
22645 +       for_each_possible_cpu(i) {
22646 +               sprintf(name, cpufmt, i);
22647 +               entry = debugfs_create_file(name, 0444, dentry,
22648 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
22649 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
22650 +               atomic_set(&my_hist->hist_mode, 1);
22651 +               my_hist->min_lat = LONG_MAX;
22652 +       }
22653 +       entry = debugfs_create_file("reset", 0644, dentry,
22654 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
22655 +#endif
22656 +
22657 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
22658 +       entry = debugfs_create_file("preemptirqsoff", 0644,
22659 +           enable_root, (void *)&preemptirqsoff_enabled_data,
22660 +           &enable_fops);
22661 +#endif
22662 +
22663 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
22664 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
22665 +           latency_hist_root);
22666 +       dentry_sharedprio = debugfs_create_dir(
22667 +           wakeup_latency_hist_dir_sharedprio, dentry);
22668 +       for_each_possible_cpu(i) {
22669 +               sprintf(name, cpufmt, i);
22670 +
22671 +               entry = debugfs_create_file(name, 0444, dentry,
22672 +                   &per_cpu(wakeup_latency_hist, i),
22673 +                   &latency_hist_fops);
22674 +               my_hist = &per_cpu(wakeup_latency_hist, i);
22675 +               atomic_set(&my_hist->hist_mode, 1);
22676 +               my_hist->min_lat = LONG_MAX;
22677 +
22678 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
22679 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
22680 +                   &latency_hist_fops);
22681 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
22682 +               atomic_set(&my_hist->hist_mode, 1);
22683 +               my_hist->min_lat = LONG_MAX;
22684 +
22685 +               sprintf(name, cpufmt_maxlatproc, i);
22686 +
22687 +               mp = &per_cpu(wakeup_maxlatproc, i);
22688 +               entry = debugfs_create_file(name, 0444, dentry, mp,
22689 +                   &maxlatproc_fops);
22690 +               clear_maxlatprocdata(mp);
22691 +
22692 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
22693 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
22694 +                   &maxlatproc_fops);
22695 +               clear_maxlatprocdata(mp);
22696 +       }
22697 +       entry = debugfs_create_file("pid", 0644, dentry,
22698 +           (void *)&wakeup_pid, &pid_fops);
22699 +       entry = debugfs_create_file("reset", 0644, dentry,
22700 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
22701 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
22702 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
22703 +       entry = debugfs_create_file("wakeup", 0644,
22704 +           enable_root, (void *)&wakeup_latency_enabled_data,
22705 +           &enable_fops);
22706 +#endif
22707 +
22708 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
22709 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
22710 +           latency_hist_root);
22711 +       for_each_possible_cpu(i) {
22712 +               sprintf(name, cpufmt, i);
22713 +               entry = debugfs_create_file(name, 0444, dentry,
22714 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
22715 +               my_hist = &per_cpu(missed_timer_offsets, i);
22716 +               atomic_set(&my_hist->hist_mode, 1);
22717 +               my_hist->min_lat = LONG_MAX;
22718 +
22719 +               sprintf(name, cpufmt_maxlatproc, i);
22720 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
22721 +               entry = debugfs_create_file(name, 0444, dentry, mp,
22722 +                   &maxlatproc_fops);
22723 +               clear_maxlatprocdata(mp);
22724 +       }
22725 +       entry = debugfs_create_file("pid", 0644, dentry,
22726 +           (void *)&missed_timer_offsets_pid, &pid_fops);
22727 +       entry = debugfs_create_file("reset", 0644, dentry,
22728 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
22729 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
22730 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
22731 +           &enable_fops);
22732 +#endif
22733 +
22734 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
22735 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
22736 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
22737 +           latency_hist_root);
22738 +       for_each_possible_cpu(i) {
22739 +               sprintf(name, cpufmt, i);
22740 +               entry = debugfs_create_file(name, 0444, dentry,
22741 +                   &per_cpu(timerandwakeup_latency_hist, i),
22742 +                   &latency_hist_fops);
22743 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
22744 +               atomic_set(&my_hist->hist_mode, 1);
22745 +               my_hist->min_lat = LONG_MAX;
22746 +
22747 +               sprintf(name, cpufmt_maxlatproc, i);
22748 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
22749 +               entry = debugfs_create_file(name, 0444, dentry, mp,
22750 +                   &maxlatproc_fops);
22751 +               clear_maxlatprocdata(mp);
22752 +       }
22753 +       entry = debugfs_create_file("reset", 0644, dentry,
22754 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
22755 +       entry = debugfs_create_file("timerandwakeup", 0644,
22756 +           enable_root, (void *)&timerandwakeup_enabled_data,
22757 +           &enable_fops);
22758 +#endif
22759 +       return 0;
22760 +}
22761 +
22762 +device_initcall(latency_hist_init);
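
For orientation, a minimal, self-contained sketch of the per-CPU bucket accounting that the latency_hist() helper called from the probes above is assumed to perform. The real struct hist_data and latency_hist() are defined earlier in this new file; the bucket count and field names below are illustrative assumptions, not the patch's exact definitions.

/*
 * Illustrative sketch only -- bucket count and names are assumptions.
 */
#include <limits.h>

#define SKETCH_MAX_ENTRY_NUM 10240		/* assumed: one bucket per microsecond */

struct sketch_hist_data {
	long hist_array[SKETCH_MAX_ENTRY_NUM];	/* one counter per latency value */
	long beyond_hist_bound_samples;		/* latencies past the last bucket */
	long min_lat;
	long max_lat;
	unsigned long long total_samples;
	unsigned long long accumulated_lat;
};

static void sketch_hist_reset(struct sketch_hist_data *h)
{
	*h = (struct sketch_hist_data){ 0 };
	h->min_lat = LONG_MAX;			/* matches my_hist->min_lat = LONG_MAX above */
}

static void sketch_hist_update(struct sketch_hist_data *h, long latency_us)
{
	if (latency_us < 0)
		latency_us = 0;
	if (latency_us >= SKETCH_MAX_ENTRY_NUM)
		h->beyond_hist_bound_samples++;
	else
		h->hist_array[latency_us]++;

	if (latency_us < h->min_lat)
		h->min_lat = latency_us;
	if (latency_us > h->max_lat)
		h->max_lat = latency_us;
	h->total_samples++;
	h->accumulated_lat += latency_us;
}

Each latency_hist(<type>, cpu, latency, ...) call in the probes above then amounts to one such update on the per-CPU histogram selected by <type>, and the CPU%d, reset, pid and max_latency-CPU%d files created in latency_hist_init() are the debugfs view onto that state.
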
22763 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
22764 index 83c60f9013cb..6fb207964a84 100644
22765 --- a/kernel/trace/trace.c
22766 +++ b/kernel/trace/trace.c
22767 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
22768         struct task_struct *tsk = current;
22769  
22770         entry->preempt_count            = pc & 0xff;
22771 +       entry->preempt_lazy_count       = preempt_lazy_count();
22772         entry->pid                      = (tsk) ? tsk->pid : 0;
22773         entry->flags =
22774  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
22775 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
22776                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
22777                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
22778                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
22779 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
22780 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
22781 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
22782                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
22783 +
22784 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
22785  }
22786  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
22787  
22788 @@ -2892,14 +2896,17 @@ get_total_entries(struct trace_buffer *buf,
22789  
22790  static void print_lat_help_header(struct seq_file *m)
22791  {
22792 -       seq_puts(m, "#                  _------=> CPU#            \n"
22793 -                   "#                 / _-----=> irqs-off        \n"
22794 -                   "#                | / _----=> need-resched    \n"
22795 -                   "#                || / _---=> hardirq/softirq \n"
22796 -                   "#                ||| / _--=> preempt-depth   \n"
22797 -                   "#                |||| /     delay            \n"
22798 -                   "#  cmd     pid   ||||| time  |   caller      \n"
22799 -                   "#     \\   /      |||||  \\    |   /         \n");
22800 +       seq_puts(m, "#                  _--------=> CPU#              \n"
22801 +                   "#                 / _-------=> irqs-off          \n"
22802 +                   "#                | / _------=> need-resched      \n"
22803 +                   "#                || / _-----=> need-resched_lazy \n"
22804 +                   "#                ||| / _----=> hardirq/softirq   \n"
22805 +                   "#                |||| / _---=> preempt-depth     \n"
22806 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
22807 +                   "#                |||||| / _-=> migrate-disable   \n"
22808 +                   "#                ||||||| /     delay             \n"
22809 +                   "# cmd     pid    |||||||| time   |  caller       \n"
22810 +                   "#     \\   /      ||||||||   \\    |  /            \n");
22811  }
22812  
22813  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
22814 @@ -2925,11 +2932,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
22815         print_event_info(buf, m);
22816         seq_puts(m, "#                              _-----=> irqs-off\n"
22817                     "#                             / _----=> need-resched\n"
22818 -                   "#                            | / _---=> hardirq/softirq\n"
22819 -                   "#                            || / _--=> preempt-depth\n"
22820 -                   "#                            ||| /     delay\n"
22821 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
22822 -                   "#              | |       |   ||||       |         |\n");
22823 +                   "#                            |/  _-----=> need-resched_lazy\n"
22824 +                   "#                            || / _---=> hardirq/softirq\n"
22825 +                   "#                            ||| / _--=> preempt-depth\n"
22826 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
22827 +                   "#                            ||||| / _-=> migrate-disable   \n"
22828 +                   "#                            |||||| /    delay\n"
22829 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
22830 +                   "#              | |       |   |||||||      |         |\n");
22831  }
22832  
22833  void
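
Storing preempt_lazy_count and migrate_disable per event assumes matching members in struct trace_entry; that change is made elsewhere in this patch (include/linux/trace_events.h). As a rough sketch of the resulting entry layout -- an assumption, the exact order and padding may differ:

struct trace_entry {
	unsigned short		type;
	unsigned char		flags;
	unsigned char		preempt_count;
	int			pid;
	unsigned short		migrate_disable;	/* new: exported as a common field below */
	unsigned short		padding;		/* new: keeps the layout aligned */
	unsigned char		preempt_lazy_count;	/* new: printed by trace_print_lat_fmt() */
};
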
22834 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
22835 index b0d8576c27ae..702b9376b278 100644
22836 --- a/kernel/trace/trace.h
22837 +++ b/kernel/trace/trace.h
22838 @@ -124,6 +124,7 @@ struct kretprobe_trace_entry_head {
22839   *  NEED_RESCHED       - reschedule is requested
22840   *  HARDIRQ            - inside an interrupt handler
22841   *  SOFTIRQ            - inside a softirq handler
22842 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
22843   */
22844  enum trace_flag_type {
22845         TRACE_FLAG_IRQS_OFF             = 0x01,
22846 @@ -133,6 +134,7 @@ enum trace_flag_type {
22847         TRACE_FLAG_SOFTIRQ              = 0x10,
22848         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
22849         TRACE_FLAG_NMI                  = 0x40,
22850 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
22851  };
22852  
22853  #define TRACE_BUF_SIZE         1024
22854 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
22855 index 03c0a48c3ac4..0b85d516b491 100644
22856 --- a/kernel/trace/trace_events.c
22857 +++ b/kernel/trace/trace_events.c
22858 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
22859         __common_field(unsigned char, flags);
22860         __common_field(unsigned char, preempt_count);
22861         __common_field(int, pid);
22862 +       __common_field(unsigned short, migrate_disable);
22863 +       __common_field(unsigned short, padding);
22864  
22865         return ret;
22866  }
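
Because the new members are registered as common fields, every event's format description under /sys/kernel/debug/tracing/events/.../format should now advertise them to userspace parsers, roughly as follows (offsets shown are an assumption based on the existing common fields):

	field:unsigned short common_migrate_disable;	offset:8;	size:2;	signed:0;
	field:unsigned short common_padding;	offset:10;	size:2;	signed:0;
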
22867 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
22868 index 03cdff84d026..940bd10b4406 100644
22869 --- a/kernel/trace/trace_irqsoff.c
22870 +++ b/kernel/trace/trace_irqsoff.c
22871 @@ -13,6 +13,7 @@
22872  #include <linux/uaccess.h>
22873  #include <linux/module.h>
22874  #include <linux/ftrace.h>
22875 +#include <trace/events/hist.h>
22876  
22877  #include "trace.h"
22878  
22879 @@ -424,11 +425,13 @@ void start_critical_timings(void)
22880  {
22881         if (preempt_trace() || irq_trace())
22882                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22883 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
22884  }
22885  EXPORT_SYMBOL_GPL(start_critical_timings);
22886  
22887  void stop_critical_timings(void)
22888  {
22889 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
22890         if (preempt_trace() || irq_trace())
22891                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22892  }
22893 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
22894  #ifdef CONFIG_PROVE_LOCKING
22895  void time_hardirqs_on(unsigned long a0, unsigned long a1)
22896  {
22897 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
22898         if (!preempt_trace() && irq_trace())
22899                 stop_critical_timing(a0, a1);
22900  }
22901 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
22902  {
22903         if (!preempt_trace() && irq_trace())
22904                 start_critical_timing(a0, a1);
22905 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
22906  }
22907  
22908  #else /* !CONFIG_PROVE_LOCKING */
22909 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
22910   */
22911  void trace_hardirqs_on(void)
22912  {
22913 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
22914         if (!preempt_trace() && irq_trace())
22915                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22916  }
22917 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
22918  {
22919         if (!preempt_trace() && irq_trace())
22920                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
22921 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
22922  }
22923  EXPORT_SYMBOL(trace_hardirqs_off);
22924  
22925  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
22926  {
22927 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
22928         if (!preempt_trace() && irq_trace())
22929                 stop_critical_timing(CALLER_ADDR0, caller_addr);
22930  }
22931 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
22932  {
22933         if (!preempt_trace() && irq_trace())
22934                 start_critical_timing(CALLER_ADDR0, caller_addr);
22935 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
22936  }
22937  EXPORT_SYMBOL(trace_hardirqs_off_caller);
22938  
22939 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
22940  #ifdef CONFIG_PREEMPT_TRACER
22941  void trace_preempt_on(unsigned long a0, unsigned long a1)
22942  {
22943 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
22944         if (preempt_trace() && !irq_trace())
22945                 stop_critical_timing(a0, a1);
22946  }
22947  
22948  void trace_preempt_off(unsigned long a0, unsigned long a1)
22949  {
22950 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
22951         if (preempt_trace() && !irq_trace())
22952                 start_critical_timing(a0, a1);
22953  }
22954 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
22955 index 3fc20422c166..65a6dde71a7d 100644
22956 --- a/kernel/trace/trace_output.c
22957 +++ b/kernel/trace/trace_output.c
22958 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
22959  {
22960         char hardsoft_irq;
22961         char need_resched;
22962 +       char need_resched_lazy;
22963         char irqs_off;
22964         int hardirq;
22965         int softirq;
22966 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
22967                 break;
22968         }
22969  
22970 +       need_resched_lazy =
22971 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
22972 +
22973         hardsoft_irq =
22974                 (nmi && hardirq)     ? 'Z' :
22975                 nmi                  ? 'z' :
22976 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
22977                 softirq              ? 's' :
22978                                        '.' ;
22979  
22980 -       trace_seq_printf(s, "%c%c%c",
22981 -                        irqs_off, need_resched, hardsoft_irq);
22982 +       trace_seq_printf(s, "%c%c%c%c",
22983 +                        irqs_off, need_resched, need_resched_lazy,
22984 +                        hardsoft_irq);
22985  
22986         if (entry->preempt_count)
22987                 trace_seq_printf(s, "%x", entry->preempt_count);
22988         else
22989                 trace_seq_putc(s, '.');
22990  
22991 +       if (entry->preempt_lazy_count)
22992 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
22993 +       else
22994 +               trace_seq_putc(s, '.');
22995 +
22996 +       if (entry->migrate_disable)
22997 +               trace_seq_printf(s, "%x", entry->migrate_disable);
22998 +       else
22999 +               trace_seq_putc(s, '.');
23000 +
23001         return !trace_seq_has_overflowed(s);
23002  }
23003  
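
Taken together with the wider header from kernel/trace/trace.c above, the small field between the task/pid columns and the timestamp in latency-format output now carries seven positions: irqs-off, need-resched, need-resched_lazy, hardirq/softirq, preempt-depth, preempt-lazy-depth and migrate-disable. A field such as

	dnL.211

therefore reads: interrupts off ('d'), reschedule requested ('n'), lazy reschedule requested ('L'), no hard/soft-irq or NMI context ('.'), preempt_count 2, preempt_lazy_count 1, migrate_disable 1. A '.' in any of the last three positions means the corresponding counter is zero.
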
23004 diff --git a/kernel/user.c b/kernel/user.c
23005 index b069ccbfb0b0..1a2e88e98b5e 100644
23006 --- a/kernel/user.c
23007 +++ b/kernel/user.c
23008 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
23009         if (!up)
23010                 return;
23011  
23012 -       local_irq_save(flags);
23013 +       local_irq_save_nort(flags);
23014         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
23015                 free_user(up, flags);
23016         else
23017 -               local_irq_restore(flags);
23018 +               local_irq_restore_nort(flags);
23019  }
23020  
23021  struct user_struct *alloc_uid(kuid_t uid)
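
The _nort variants used here are introduced elsewhere in this patch. The point of the conversion is that uidhash_lock is a spinlock_t, which PREEMPT_RT turns into a sleeping lock, so it must not be taken with hard interrupts disabled; non-RT kernels keep the old behaviour. A sketch of the assumed semantics, not the patch's literal definitions:

/* assumed definitions, following the usual -rt conversion pattern */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
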
23022 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
23023 index 6d1020c03d41..70c6a2f79f7e 100644
23024 --- a/kernel/watchdog.c
23025 +++ b/kernel/watchdog.c
23026 @@ -315,6 +315,8 @@ static int is_softlockup(unsigned long touch_ts)
23027  
23028  #ifdef CONFIG_HARDLOCKUP_DETECTOR
23029  
23030 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
23031 +
23032  static struct perf_event_attr wd_hw_attr = {
23033         .type           = PERF_TYPE_HARDWARE,
23034         .config         = PERF_COUNT_HW_CPU_CYCLES,
23035 @@ -348,6 +350,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
23036                 /* only print hardlockups once */
23037                 if (__this_cpu_read(hard_watchdog_warn) == true)
23038                         return;
23039 +               /*
23040 +                * If early-printk is enabled then make sure we do not
23041 +                * lock up in printk() and kill console logging:
23042 +                */
23043 +               printk_kill();
23044 +
23045 +               raw_spin_lock(&watchdog_output_lock);
23046  
23047                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
23048                 print_modules();
23049 @@ -365,6 +374,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
23050                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
23051                         trigger_allbutself_cpu_backtrace();
23052  
23053 +               raw_spin_unlock(&watchdog_output_lock);
23054                 if (hardlockup_panic)
23055                         nmi_panic(regs, "Hard LOCKUP");
23056  
23057 @@ -512,6 +522,7 @@ static void watchdog_enable(unsigned int cpu)
23058         /* kick off the timer for the hardlockup detector */
23059         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
23060         hrtimer->function = watchdog_timer_fn;
23061 +       hrtimer->irqsafe = 1;
23062  
23063         /* Enable the perf event */
23064         watchdog_nmi_enable(cpu);
23065 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
23066 index 479d840db286..24eba6620a45 100644
23067 --- a/kernel/workqueue.c
23068 +++ b/kernel/workqueue.c
23069 @@ -48,6 +48,8 @@
23070  #include <linux/nodemask.h>
23071  #include <linux/moduleparam.h>
23072  #include <linux/uaccess.h>
23073 +#include <linux/locallock.h>
23074 +#include <linux/delay.h>
23075  
23076  #include "workqueue_internal.h"
23077  
23078 @@ -121,11 +123,16 @@ enum {
23079   *    cpu or grabbing pool->lock is enough for read access.  If
23080   *    POOL_DISASSOCIATED is set, it's identical to L.
23081   *
23082 + *    On RT we need the extra protection via rt_lock_idle_list() for
23083 + *    the list manipulations against read access from
23084 + *    wq_worker_sleeping(). All other places are nicely serialized via
23085 + *    pool->lock.
23086 + *
23087   * A: pool->attach_mutex protected.
23088   *
23089   * PL: wq_pool_mutex protected.
23090   *
23091 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
23092 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
23093   *
23094   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
23095   *
23096 @@ -134,7 +141,7 @@ enum {
23097   *
23098   * WQ: wq->mutex protected.
23099   *
23100 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
23101 + * WR: wq->mutex protected for writes.  RCU protected for reads.
23102   *
23103   * MD: wq_mayday_lock protected.
23104   */
23105 @@ -185,7 +192,7 @@ struct worker_pool {
23106         atomic_t                nr_running ____cacheline_aligned_in_smp;
23107  
23108         /*
23109 -        * Destruction of pool is sched-RCU protected to allow dereferences
23110 +        * Destruction of pool is RCU protected to allow dereferences
23111          * from get_work_pool().
23112          */
23113         struct rcu_head         rcu;
23114 @@ -214,7 +221,7 @@ struct pool_workqueue {
23115         /*
23116          * Release of unbound pwq is punted to system_wq.  See put_pwq()
23117          * and pwq_unbound_release_workfn() for details.  pool_workqueue
23118 -        * itself is also sched-RCU protected so that the first pwq can be
23119 +        * itself is also RCU protected so that the first pwq can be
23120          * determined without grabbing wq->mutex.
23121          */
23122         struct work_struct      unbound_release_work;
23123 @@ -348,6 +355,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
23124  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
23125  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
23126  
23127 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
23128 +
23129  static int worker_thread(void *__worker);
23130  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23131  
23132 @@ -355,20 +364,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23133  #include <trace/events/workqueue.h>
23134  
23135  #define assert_rcu_or_pool_mutex()                                     \
23136 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23137 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23138                          !lockdep_is_held(&wq_pool_mutex),              \
23139 -                        "sched RCU or wq_pool_mutex should be held")
23140 +                        "RCU or wq_pool_mutex should be held")
23141  
23142  #define assert_rcu_or_wq_mutex(wq)                                     \
23143 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23144 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23145                          !lockdep_is_held(&wq->mutex),                  \
23146 -                        "sched RCU or wq->mutex should be held")
23147 +                        "RCU or wq->mutex should be held")
23148  
23149  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
23150 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
23151 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
23152                          !lockdep_is_held(&wq->mutex) &&                \
23153                          !lockdep_is_held(&wq_pool_mutex),              \
23154 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
23155 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
23156  
23157  #define for_each_cpu_worker_pool(pool, cpu)                            \
23158         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
23159 @@ -380,7 +389,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23160   * @pool: iteration cursor
23161   * @pi: integer used for iteration
23162   *
23163 - * This must be called either with wq_pool_mutex held or sched RCU read
23164 + * This must be called either with wq_pool_mutex held or RCU read
23165   * locked.  If the pool needs to be used beyond the locking in effect, the
23166   * caller is responsible for guaranteeing that the pool stays online.
23167   *
23168 @@ -412,7 +421,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23169   * @pwq: iteration cursor
23170   * @wq: the target workqueue
23171   *
23172 - * This must be called either with wq->mutex held or sched RCU read locked.
23173 + * This must be called either with wq->mutex held or RCU read locked.
23174   * If the pwq needs to be used beyond the locking in effect, the caller is
23175   * responsible for guaranteeing that the pwq stays online.
23176   *
23177 @@ -424,6 +433,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
23178                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
23179                 else
23180  
23181 +#ifdef CONFIG_PREEMPT_RT_BASE
23182 +static inline void rt_lock_idle_list(struct worker_pool *pool)
23183 +{
23184 +       preempt_disable();
23185 +}
23186 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
23187 +{
23188 +       preempt_enable();
23189 +}
23190 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
23191 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
23192 +#else
23193 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
23194 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
23195 +static inline void sched_lock_idle_list(struct worker_pool *pool)
23196 +{
23197 +       spin_lock_irq(&pool->lock);
23198 +}
23199 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
23200 +{
23201 +       spin_unlock_irq(&pool->lock);
23202 +}
23203 +#endif
23204 +
23205 +
23206  #ifdef CONFIG_DEBUG_OBJECTS_WORK
23207  
23208  static struct debug_obj_descr work_debug_descr;
23209 @@ -548,7 +582,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
23210   * @wq: the target workqueue
23211   * @node: the node ID
23212   *
23213 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
23214 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
23215   * read locked.
23216   * If the pwq needs to be used beyond the locking in effect, the caller is
23217   * responsible for guaranteeing that the pwq stays online.
23218 @@ -692,8 +726,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
23219   * @work: the work item of interest
23220   *
23221   * Pools are created and destroyed under wq_pool_mutex, and allows read
23222 - * access under sched-RCU read lock.  As such, this function should be
23223 - * called under wq_pool_mutex or with preemption disabled.
23224 + * access under RCU read lock.  As such, this function should be
23225 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
23226   *
23227   * All fields of the returned pool are accessible as long as the above
23228   * mentioned locking is in effect.  If the returned pool needs to be used
23229 @@ -830,50 +864,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
23230   */
23231  static void wake_up_worker(struct worker_pool *pool)
23232  {
23233 -       struct worker *worker = first_idle_worker(pool);
23234 +       struct worker *worker;
23235 +
23236 +       rt_lock_idle_list(pool);
23237 +
23238 +       worker = first_idle_worker(pool);
23239  
23240         if (likely(worker))
23241                 wake_up_process(worker->task);
23242 +
23243 +       rt_unlock_idle_list(pool);
23244  }
23245  
23246  /**
23247 - * wq_worker_waking_up - a worker is waking up
23248 + * wq_worker_running - a worker is running again
23249   * @task: task waking up
23250 - * @cpu: CPU @task is waking up to
23251   *
23252 - * This function is called during try_to_wake_up() when a worker is
23253 - * being awoken.
23254 - *
23255 - * CONTEXT:
23256 - * spin_lock_irq(rq->lock)
23257 + * This function is called when a worker returns from schedule()
23258   */
23259 -void wq_worker_waking_up(struct task_struct *task, int cpu)
23260 +void wq_worker_running(struct task_struct *task)
23261  {
23262         struct worker *worker = kthread_data(task);
23263  
23264 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
23265 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
23266 +       if (!worker->sleeping)
23267 +               return;
23268 +       if (!(worker->flags & WORKER_NOT_RUNNING))
23269                 atomic_inc(&worker->pool->nr_running);
23270 -       }
23271 +       worker->sleeping = 0;
23272  }
23273  
23274  /**
23275   * wq_worker_sleeping - a worker is going to sleep
23276   * @task: task going to sleep
23277   *
23278 - * This function is called during schedule() when a busy worker is
23279 - * going to sleep.  Worker on the same cpu can be woken up by
23280 - * returning pointer to its task.
23281 - *
23282 - * CONTEXT:
23283 - * spin_lock_irq(rq->lock)
23284 - *
23285 - * Return:
23286 - * Worker task on @cpu to wake up, %NULL if none.
23287 + * This function is called from schedule() when a busy worker is
23288 + * going to sleep.
23289   */
23290 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
23291 +void wq_worker_sleeping(struct task_struct *task)
23292  {
23293 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
23294 +       struct worker *worker = kthread_data(task);
23295         struct worker_pool *pool;
23296  
23297         /*
23298 @@ -882,29 +911,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
23299          * checking NOT_RUNNING.
23300          */
23301         if (worker->flags & WORKER_NOT_RUNNING)
23302 -               return NULL;
23303 +               return;
23304  
23305         pool = worker->pool;
23306  
23307 -       /* this can only happen on the local cpu */
23308 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
23309 -               return NULL;
23310 +       if (WARN_ON_ONCE(worker->sleeping))
23311 +               return;
23312 +
23313 +       worker->sleeping = 1;
23314  
23315         /*
23316          * The counterpart of the following dec_and_test, implied mb,
23317          * worklist not empty test sequence is in insert_work().
23318          * Please read comment there.
23319 -        *
23320 -        * NOT_RUNNING is clear.  This means that we're bound to and
23321 -        * running on the local cpu w/ rq lock held and preemption
23322 -        * disabled, which in turn means that none else could be
23323 -        * manipulating idle_list, so dereferencing idle_list without pool
23324 -        * lock is safe.
23325          */
23326         if (atomic_dec_and_test(&pool->nr_running) &&
23327 -           !list_empty(&pool->worklist))
23328 -               to_wakeup = first_idle_worker(pool);
23329 -       return to_wakeup ? to_wakeup->task : NULL;
23330 +           !list_empty(&pool->worklist)) {
23331 +               sched_lock_idle_list(pool);
23332 +               wake_up_worker(pool);
23333 +               sched_unlock_idle_list(pool);
23334 +       }
23335  }
23336  
23337  /**
23338 @@ -1098,12 +1124,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
23339  {
23340         if (pwq) {
23341                 /*
23342 -                * As both pwqs and pools are sched-RCU protected, the
23343 +                * As both pwqs and pools are RCU protected, the
23344                  * following lock operations are safe.
23345                  */
23346 -               spin_lock_irq(&pwq->pool->lock);
23347 +               rcu_read_lock();
23348 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
23349                 put_pwq(pwq);
23350 -               spin_unlock_irq(&pwq->pool->lock);
23351 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
23352 +               rcu_read_unlock();
23353         }
23354  }
23355  
23356 @@ -1207,7 +1235,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23357         struct worker_pool *pool;
23358         struct pool_workqueue *pwq;
23359  
23360 -       local_irq_save(*flags);
23361 +       local_lock_irqsave(pendingb_lock, *flags);
23362  
23363         /* try to steal the timer if it exists */
23364         if (is_dwork) {
23365 @@ -1226,6 +1254,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23366         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
23367                 return 0;
23368  
23369 +       rcu_read_lock();
23370         /*
23371          * The queueing is in progress, or it is already queued. Try to
23372          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
23373 @@ -1264,14 +1293,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
23374                 set_work_pool_and_keep_pending(work, pool->id);
23375  
23376                 spin_unlock(&pool->lock);
23377 +               rcu_read_unlock();
23378                 return 1;
23379         }
23380         spin_unlock(&pool->lock);
23381  fail:
23382 -       local_irq_restore(*flags);
23383 +       rcu_read_unlock();
23384 +       local_unlock_irqrestore(pendingb_lock, *flags);
23385         if (work_is_canceling(work))
23386                 return -ENOENT;
23387 -       cpu_relax();
23388 +       cpu_chill();
23389         return -EAGAIN;
23390  }
23391  
23392 @@ -1373,7 +1404,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
23393          * queued or lose PENDING.  Grabbing PENDING and queueing should
23394          * happen with IRQ disabled.
23395          */
23396 -       WARN_ON_ONCE(!irqs_disabled());
23397 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
23398  
23399         debug_work_activate(work);
23400  
23401 @@ -1381,6 +1412,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
23402         if (unlikely(wq->flags & __WQ_DRAINING) &&
23403             WARN_ON_ONCE(!is_chained_work(wq)))
23404                 return;
23405 +       rcu_read_lock();
23406  retry:
23407         if (req_cpu == WORK_CPU_UNBOUND)
23408                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
23409 @@ -1437,10 +1469,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
23410         /* pwq determined, queue */
23411         trace_workqueue_queue_work(req_cpu, pwq, work);
23412  
23413 -       if (WARN_ON(!list_empty(&work->entry))) {
23414 -               spin_unlock(&pwq->pool->lock);
23415 -               return;
23416 -       }
23417 +       if (WARN_ON(!list_empty(&work->entry)))
23418 +               goto out;
23419  
23420         pwq->nr_in_flight[pwq->work_color]++;
23421         work_flags = work_color_to_flags(pwq->work_color);
23422 @@ -1458,7 +1488,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
23423  
23424         insert_work(pwq, work, worklist, work_flags);
23425  
23426 +out:
23427         spin_unlock(&pwq->pool->lock);
23428 +       rcu_read_unlock();
23429  }
23430  
23431  /**
23432 @@ -1478,14 +1510,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
23433         bool ret = false;
23434         unsigned long flags;
23435  
23436 -       local_irq_save(flags);
23437 +       local_lock_irqsave(pendingb_lock, flags);
23438  
23439         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
23440                 __queue_work(cpu, wq, work);
23441                 ret = true;
23442         }
23443  
23444 -       local_irq_restore(flags);
23445 +       local_unlock_irqrestore(pendingb_lock, flags);
23446         return ret;
23447  }
23448  EXPORT_SYMBOL(queue_work_on);
23449 @@ -1552,14 +1584,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
23450         unsigned long flags;
23451  
23452         /* read the comment in __queue_work() */
23453 -       local_irq_save(flags);
23454 +       local_lock_irqsave(pendingb_lock, flags);
23455  
23456         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
23457                 __queue_delayed_work(cpu, wq, dwork, delay);
23458                 ret = true;
23459         }
23460  
23461 -       local_irq_restore(flags);
23462 +       local_unlock_irqrestore(pendingb_lock, flags);
23463         return ret;
23464  }
23465  EXPORT_SYMBOL(queue_delayed_work_on);
23466 @@ -1594,7 +1626,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
23467  
23468         if (likely(ret >= 0)) {
23469                 __queue_delayed_work(cpu, wq, dwork, delay);
23470 -               local_irq_restore(flags);
23471 +               local_unlock_irqrestore(pendingb_lock, flags);
23472         }
23473  
23474         /* -ENOENT from try_to_grab_pending() becomes %true */
23475 @@ -1627,7 +1659,9 @@ static void worker_enter_idle(struct worker *worker)
23476         worker->last_active = jiffies;
23477  
23478         /* idle_list is LIFO */
23479 +       rt_lock_idle_list(pool);
23480         list_add(&worker->entry, &pool->idle_list);
23481 +       rt_unlock_idle_list(pool);
23482  
23483         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
23484                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
23485 @@ -1660,7 +1694,9 @@ static void worker_leave_idle(struct worker *worker)
23486                 return;
23487         worker_clr_flags(worker, WORKER_IDLE);
23488         pool->nr_idle--;
23489 +       rt_lock_idle_list(pool);
23490         list_del_init(&worker->entry);
23491 +       rt_unlock_idle_list(pool);
23492  }
23493  
23494  static struct worker *alloc_worker(int node)
23495 @@ -1826,7 +1862,9 @@ static void destroy_worker(struct worker *worker)
23496         pool->nr_workers--;
23497         pool->nr_idle--;
23498  
23499 +       rt_lock_idle_list(pool);
23500         list_del_init(&worker->entry);
23501 +       rt_unlock_idle_list(pool);
23502         worker->flags |= WORKER_DIE;
23503         wake_up_process(worker->task);
23504  }
23505 @@ -2785,14 +2823,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
23506  
23507         might_sleep();
23508  
23509 -       local_irq_disable();
23510 +       rcu_read_lock();
23511         pool = get_work_pool(work);
23512         if (!pool) {
23513 -               local_irq_enable();
23514 +               rcu_read_unlock();
23515                 return false;
23516         }
23517  
23518 -       spin_lock(&pool->lock);
23519 +       spin_lock_irq(&pool->lock);
23520         /* see the comment in try_to_grab_pending() with the same code */
23521         pwq = get_work_pwq(work);
23522         if (pwq) {
23523 @@ -2821,10 +2859,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
23524         else
23525                 lock_map_acquire_read(&pwq->wq->lockdep_map);
23526         lock_map_release(&pwq->wq->lockdep_map);
23527 -
23528 +       rcu_read_unlock();
23529         return true;
23530  already_gone:
23531         spin_unlock_irq(&pool->lock);
23532 +       rcu_read_unlock();
23533         return false;
23534  }
23535  
23536 @@ -2911,7 +2950,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
23537  
23538         /* tell other tasks trying to grab @work to back off */
23539         mark_work_canceling(work);
23540 -       local_irq_restore(flags);
23541 +       local_unlock_irqrestore(pendingb_lock, flags);
23542  
23543         flush_work(work);
23544         clear_work_data(work);
23545 @@ -2966,10 +3005,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
23546   */
23547  bool flush_delayed_work(struct delayed_work *dwork)
23548  {
23549 -       local_irq_disable();
23550 +       local_lock_irq(pendingb_lock);
23551         if (del_timer_sync(&dwork->timer))
23552                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
23553 -       local_irq_enable();
23554 +       local_unlock_irq(pendingb_lock);
23555         return flush_work(&dwork->work);
23556  }
23557  EXPORT_SYMBOL(flush_delayed_work);
23558 @@ -2987,7 +3026,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
23559                 return false;
23560  
23561         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
23562 -       local_irq_restore(flags);
23563 +       local_unlock_irqrestore(pendingb_lock, flags);
23564         return ret;
23565  }
23566  
23567 @@ -3245,7 +3284,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
23568   * put_unbound_pool - put a worker_pool
23569   * @pool: worker_pool to put
23570   *
23571 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
23572 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
23573   * safe manner.  get_unbound_pool() calls this function on its failure path
23574   * and this function should be able to release pools which went through,
23575   * successfully or not, init_worker_pool().
23576 @@ -3299,8 +3338,8 @@ static void put_unbound_pool(struct worker_pool *pool)
23577         del_timer_sync(&pool->idle_timer);
23578         del_timer_sync(&pool->mayday_timer);
23579  
23580 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
23581 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
23582 +       /* RCU protected to allow dereferences from get_work_pool() */
23583 +       call_rcu(&pool->rcu, rcu_free_pool);
23584  }
23585  
23586  /**
23587 @@ -3407,14 +3446,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
23588         put_unbound_pool(pool);
23589         mutex_unlock(&wq_pool_mutex);
23590  
23591 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
23592 +       call_rcu(&pwq->rcu, rcu_free_pwq);
23593  
23594         /*
23595          * If we're the last pwq going away, @wq is already dead and no one
23596          * is gonna access it anymore.  Schedule RCU free.
23597          */
23598         if (is_last)
23599 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
23600 +               call_rcu(&wq->rcu, rcu_free_wq);
23601  }
23602  
23603  /**
23604 @@ -4064,7 +4103,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
23605                  * The base ref is never dropped on per-cpu pwqs.  Directly
23606                  * schedule RCU free.
23607                  */
23608 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
23609 +               call_rcu(&wq->rcu, rcu_free_wq);
23610         } else {
23611                 /*
23612                  * We're the sole accessor of @wq at this point.  Directly
23613 @@ -4157,7 +4196,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
23614         struct pool_workqueue *pwq;
23615         bool ret;
23616  
23617 -       rcu_read_lock_sched();
23618 +       rcu_read_lock();
23619 +       preempt_disable();
23620  
23621         if (cpu == WORK_CPU_UNBOUND)
23622                 cpu = smp_processor_id();
23623 @@ -4168,7 +4208,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
23624                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
23625  
23626         ret = !list_empty(&pwq->delayed_works);
23627 -       rcu_read_unlock_sched();
23628 +       preempt_enable();
23629 +       rcu_read_unlock();
23630  
23631         return ret;
23632  }
23633 @@ -4194,15 +4235,15 @@ unsigned int work_busy(struct work_struct *work)
23634         if (work_pending(work))
23635                 ret |= WORK_BUSY_PENDING;
23636  
23637 -       local_irq_save(flags);
23638 +       rcu_read_lock();
23639         pool = get_work_pool(work);
23640         if (pool) {
23641 -               spin_lock(&pool->lock);
23642 +               spin_lock_irqsave(&pool->lock, flags);
23643                 if (find_worker_executing_work(pool, work))
23644                         ret |= WORK_BUSY_RUNNING;
23645 -               spin_unlock(&pool->lock);
23646 +               spin_unlock_irqrestore(&pool->lock, flags);
23647         }
23648 -       local_irq_restore(flags);
23649 +       rcu_read_unlock();
23650  
23651         return ret;
23652  }
23653 @@ -4391,7 +4432,7 @@ void show_workqueue_state(void)
23654         unsigned long flags;
23655         int pi;
23656  
23657 -       rcu_read_lock_sched();
23658 +       rcu_read_lock();
23659  
23660         pr_info("Showing busy workqueues and worker pools:\n");
23661  
23662 @@ -4444,7 +4485,7 @@ void show_workqueue_state(void)
23663                 spin_unlock_irqrestore(&pool->lock, flags);
23664         }
23665  
23666 -       rcu_read_unlock_sched();
23667 +       rcu_read_unlock();
23668  }
23669  
23670  /*
23671 @@ -4782,16 +4823,16 @@ bool freeze_workqueues_busy(void)
23672                  * nr_active is monotonically decreasing.  It's safe
23673                  * to peek without lock.
23674                  */
23675 -               rcu_read_lock_sched();
23676 +               rcu_read_lock();
23677                 for_each_pwq(pwq, wq) {
23678                         WARN_ON_ONCE(pwq->nr_active < 0);
23679                         if (pwq->nr_active) {
23680                                 busy = true;
23681 -                               rcu_read_unlock_sched();
23682 +                               rcu_read_unlock();
23683                                 goto out_unlock;
23684                         }
23685                 }
23686 -               rcu_read_unlock_sched();
23687 +               rcu_read_unlock();
23688         }
23689  out_unlock:
23690         mutex_unlock(&wq_pool_mutex);
23691 @@ -4981,7 +5022,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
23692         const char *delim = "";
23693         int node, written = 0;
23694  
23695 -       rcu_read_lock_sched();
23696 +       get_online_cpus();
23697 +       rcu_read_lock();
23698         for_each_node(node) {
23699                 written += scnprintf(buf + written, PAGE_SIZE - written,
23700                                      "%s%d:%d", delim, node,
23701 @@ -4989,7 +5031,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
23702                 delim = " ";
23703         }
23704         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
23705 -       rcu_read_unlock_sched();
23706 +       rcu_read_unlock();
23707 +       put_online_cpus();
23708  
23709         return written;
23710  }
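Sketch (illustration only, not part of the patch): the workqueue.c hunks above convert pool/pwq lifetime handling from sched-RCU to plain RCU. call_rcu_sched() becomes call_rcu(), and rcu_read_lock_sched() becomes rcu_read_lock() plus an explicit preempt_disable() only where a stable CPU is still needed, since sched-RCU read-side sections rely on preemption being disabled and that is exactly what PREEMPT_RT wants to avoid. The same conversion on a hypothetical object; struct foo, foo_put() and foo_read_data() are invented for this sketch:

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct foo, rcu));
}

static void foo_put(struct foo *f)
{
	/* was: call_rcu_sched(&f->rcu, foo_free_rcu); */
	call_rcu(&f->rcu, foo_free_rcu);
}

static int foo_read_data(struct foo __rcu **slot)
{
	struct foo *f;
	int val = 0;

	/* was: rcu_read_lock_sched(); */
	rcu_read_lock();
	preempt_disable();	/* kept only where smp_processor_id() etc. must stay stable */
	f = rcu_dereference(*slot);
	if (f)
		val = f->data;
	preempt_enable();
	rcu_read_unlock();
	return val;
}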
23711 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
23712 index 8635417c587b..f000c4d6917e 100644
23713 --- a/kernel/workqueue_internal.h
23714 +++ b/kernel/workqueue_internal.h
23715 @@ -43,6 +43,7 @@ struct worker {
23716         unsigned long           last_active;    /* L: last active timestamp */
23717         unsigned int            flags;          /* X: flags */
23718         int                     id;             /* I: worker id */
23719 +       int                     sleeping;       /* None */
23720  
23721         /*
23722          * Opaque string set with work_set_desc().  Printed out with task
23723 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
23724   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
23725   * sched/core.c and workqueue.c.
23726   */
23727 -void wq_worker_waking_up(struct task_struct *task, int cpu);
23728 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
23729 +void wq_worker_running(struct task_struct *task);
23730 +void wq_worker_sleeping(struct task_struct *task);
23731  
23732  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
23733 diff --git a/lib/Kconfig b/lib/Kconfig
23734 index 260a80e313b9..b06becb3f477 100644
23735 --- a/lib/Kconfig
23736 +++ b/lib/Kconfig
23737 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
23738  
23739  config CPUMASK_OFFSTACK
23740         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
23741 +       depends on !PREEMPT_RT_FULL
23742         help
23743           Use dynamic allocation for cpumask_var_t, instead of putting
23744           them on the stack.  This is a bit more expensive, but avoids
23745 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
23746 index 056052dc8e91..d8494e126de8 100644
23747 --- a/lib/debugobjects.c
23748 +++ b/lib/debugobjects.c
23749 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
23750         struct debug_obj *obj;
23751         unsigned long flags;
23752  
23753 -       fill_pool();
23754 +#ifdef CONFIG_PREEMPT_RT_FULL
23755 +       if (preempt_count() == 0 && !irqs_disabled())
23756 +#endif
23757 +               fill_pool();
23758  
23759         db = get_bucket((unsigned long) addr);
23760  
23761 diff --git a/lib/idr.c b/lib/idr.c
23762 index 6098336df267..9decbe914595 100644
23763 --- a/lib/idr.c
23764 +++ b/lib/idr.c
23765 @@ -30,6 +30,7 @@
23766  #include <linux/idr.h>
23767  #include <linux/spinlock.h>
23768  #include <linux/percpu.h>
23769 +#include <linux/locallock.h>
23770  
23771  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
23772  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
23773 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
23774  static DEFINE_PER_CPU(int, idr_preload_cnt);
23775  static DEFINE_SPINLOCK(simple_ida_lock);
23776  
23777 +#ifdef CONFIG_PREEMPT_RT_FULL
23778 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
23779 +
23780 +static inline void idr_preload_lock(void)
23781 +{
23782 +       local_lock(idr_lock);
23783 +}
23784 +
23785 +static inline void idr_preload_unlock(void)
23786 +{
23787 +       local_unlock(idr_lock);
23788 +}
23789 +
23790 +void idr_preload_end(void)
23791 +{
23792 +       idr_preload_unlock();
23793 +}
23794 +EXPORT_SYMBOL(idr_preload_end);
23795 +#else
23796 +static inline void idr_preload_lock(void)
23797 +{
23798 +       preempt_disable();
23799 +}
23800 +
23801 +static inline void idr_preload_unlock(void)
23802 +{
23803 +       preempt_enable();
23804 +}
23805 +#endif
23806 +
23807 +
23808  /* the maximum ID which can be allocated given idr->layers */
23809  static int idr_max(int layers)
23810  {
23811 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
23812          * context.  See idr_preload() for details.
23813          */
23814         if (!in_interrupt()) {
23815 -               preempt_disable();
23816 +               idr_preload_lock();
23817                 new = __this_cpu_read(idr_preload_head);
23818                 if (new) {
23819                         __this_cpu_write(idr_preload_head, new->ary[0]);
23820                         __this_cpu_dec(idr_preload_cnt);
23821                         new->ary[0] = NULL;
23822                 }
23823 -               preempt_enable();
23824 +               idr_preload_unlock();
23825                 if (new)
23826                         return new;
23827         }
23828 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
23829         idr_mark_full(pa, id);
23830  }
23831  
23832 -
23833  /**
23834   * idr_preload - preload for idr_alloc()
23835   * @gfp_mask: allocation mask to use for preloading
23836 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
23837         WARN_ON_ONCE(in_interrupt());
23838         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
23839  
23840 -       preempt_disable();
23841 +       idr_preload_lock();
23842  
23843         /*
23844          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
23845 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
23846         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
23847                 struct idr_layer *new;
23848  
23849 -               preempt_enable();
23850 +               idr_preload_unlock();
23851                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
23852 -               preempt_disable();
23853 +               idr_preload_lock();
23854                 if (!new)
23855                         break;
23856  
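Sketch (illustration only, not part of the patch): the idr hunks route the preload path through idr_preload_lock()/idr_preload_unlock(), which is preempt_disable()/preempt_enable() on !RT but a per-CPU local lock on PREEMPT_RT_FULL, so the section stays preemptible while access to this CPU's preload cache remains serialized. The wrapper pattern in isolation; the example_* names are invented:

#include <linux/locallock.h>	/* added earlier in this patch */
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(int, example_cnt);

#ifdef CONFIG_PREEMPT_RT_FULL
static DEFINE_LOCAL_IRQ_LOCK(example_lock);
# define example_cpu_lock()	local_lock(example_lock)
# define example_cpu_unlock()	local_unlock(example_lock)
#else
# define example_cpu_lock()	preempt_disable()
# define example_cpu_unlock()	preempt_enable()
#endif

static int example_take(void)
{
	int taken = 0;

	example_cpu_lock();
	if (__this_cpu_read(example_cnt)) {
		__this_cpu_dec(example_cnt);
		taken = 1;
	}
	example_cpu_unlock();
	return taken;
}

Because the matching unlock is no longer a bare preempt_enable() on RT, the patch also has to provide an exported idr_preload_end() in that configuration.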
23857 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
23858 index 1d6565e81030..b23a79761df7 100644
23859 --- a/lib/irq_poll.c
23860 +++ b/lib/irq_poll.c
23861 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
23862         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
23863         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
23864         local_irq_restore(flags);
23865 +       preempt_check_resched_rt();
23866  }
23867  EXPORT_SYMBOL(irq_poll_sched);
23868  
23869 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
23870         local_irq_save(flags);
23871         __irq_poll_complete(iop);
23872         local_irq_restore(flags);
23873 +       preempt_check_resched_rt();
23874  }
23875  EXPORT_SYMBOL(irq_poll_complete);
23876  
23877 @@ -95,6 +97,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
23878                 }
23879  
23880                 local_irq_enable();
23881 +               preempt_check_resched_rt();
23882  
23883                 /* Even though interrupts have been re-enabled, this
23884                  * access is safe because interrupts can only add new
23885 @@ -132,6 +135,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
23886                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
23887  
23888         local_irq_enable();
23889 +       preempt_check_resched_rt();
23890  }
23891  
23892  /**
23893 @@ -195,6 +199,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
23894                          this_cpu_ptr(&blk_cpu_iopoll));
23895         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
23896         local_irq_enable();
23897 +       preempt_check_resched_rt();
23898  
23899         return 0;
23900  }
23901 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
23902 index f3a217ea0388..4611b156ef79 100644
23903 --- a/lib/locking-selftest.c
23904 +++ b/lib/locking-selftest.c
23905 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
23906  #include "locking-selftest-spin-hardirq.h"
23907  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
23908  
23909 +#ifndef CONFIG_PREEMPT_RT_FULL
23910 +
23911  #include "locking-selftest-rlock-hardirq.h"
23912  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
23913  
23914 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
23915  #include "locking-selftest-wlock-softirq.h"
23916  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
23917  
23918 +#endif
23919 +
23920  #undef E1
23921  #undef E2
23922  
23923 +#ifndef CONFIG_PREEMPT_RT_FULL
23924  /*
23925   * Enabling hardirqs with a softirq-safe lock held:
23926   */
23927 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
23928  #undef E1
23929  #undef E2
23930  
23931 +#endif
23932 +
23933  /*
23934   * Enabling irqs with an irq-safe lock held:
23935   */
23936 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
23937  #include "locking-selftest-spin-hardirq.h"
23938  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
23939  
23940 +#ifndef CONFIG_PREEMPT_RT_FULL
23941 +
23942  #include "locking-selftest-rlock-hardirq.h"
23943  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
23944  
23945 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
23946  #include "locking-selftest-wlock-softirq.h"
23947  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
23948  
23949 +#endif
23950 +
23951  #undef E1
23952  #undef E2
23953  
23954 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
23955  #include "locking-selftest-spin-hardirq.h"
23956  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
23957  
23958 +#ifndef CONFIG_PREEMPT_RT_FULL
23959 +
23960  #include "locking-selftest-rlock-hardirq.h"
23961  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
23962  
23963 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
23964  #include "locking-selftest-wlock-softirq.h"
23965  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
23966  
23967 +#endif
23968 +
23969  #undef E1
23970  #undef E2
23971  #undef E3
23972 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
23973  #include "locking-selftest-spin-hardirq.h"
23974  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
23975  
23976 +#ifndef CONFIG_PREEMPT_RT_FULL
23977 +
23978  #include "locking-selftest-rlock-hardirq.h"
23979  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
23980  
23981 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
23982  #include "locking-selftest-wlock-softirq.h"
23983  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
23984  
23985 +#endif
23986 +
23987  #undef E1
23988  #undef E2
23989  #undef E3
23990  
23991 +#ifndef CONFIG_PREEMPT_RT_FULL
23992 +
23993  /*
23994   * read-lock / write-lock irq inversion.
23995   *
23996 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
23997  #undef E2
23998  #undef E3
23999  
24000 +#endif
24001 +
24002 +#ifndef CONFIG_PREEMPT_RT_FULL
24003 +
24004  /*
24005   * read-lock / write-lock recursion that is actually safe.
24006   */
24007 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
24008  #undef E2
24009  #undef E3
24010  
24011 +#endif
24012 +
24013  /*
24014   * read-lock / write-lock recursion that is unsafe.
24015   */
24016 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
24017  
24018         printk("  --------------------------------------------------------------------------\n");
24019  
24020 +#ifndef CONFIG_PREEMPT_RT_FULL
24021         /*
24022          * irq-context testcases:
24023          */
24024 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
24025  
24026         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
24027  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
24028 +#else
24029 +       /* On -rt, we only run the hardirq context tests for raw spinlocks */
24030 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
24031 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
24032 +
24033 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
24034 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
24035 +
24036 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
24037 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
24038 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
24039 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
24040 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
24041 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
24042 +
24043 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
24044 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
24045 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
24046 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
24047 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
24048 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
24049 +#endif
24050  
24051         ww_tests();
24052  
24053 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
24054 index 6d40944960de..822a2c027e72 100644
24055 --- a/lib/percpu_ida.c
24056 +++ b/lib/percpu_ida.c
24057 @@ -26,6 +26,9 @@
24058  #include <linux/string.h>
24059  #include <linux/spinlock.h>
24060  #include <linux/percpu_ida.h>
24061 +#include <linux/locallock.h>
24062 +
24063 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
24064  
24065  struct percpu_ida_cpu {
24066         /*
24067 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24068         unsigned long flags;
24069         int tag;
24070  
24071 -       local_irq_save(flags);
24072 +       local_lock_irqsave(irq_off_lock, flags);
24073         tags = this_cpu_ptr(pool->tag_cpu);
24074  
24075         /* Fastpath */
24076         tag = alloc_local_tag(tags);
24077         if (likely(tag >= 0)) {
24078 -               local_irq_restore(flags);
24079 +               local_unlock_irqrestore(irq_off_lock, flags);
24080                 return tag;
24081         }
24082  
24083 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24084  
24085                 if (!tags->nr_free)
24086                         alloc_global_tags(pool, tags);
24087 +
24088                 if (!tags->nr_free)
24089                         steal_tags(pool, tags);
24090  
24091 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24092                 }
24093  
24094                 spin_unlock(&pool->lock);
24095 -               local_irq_restore(flags);
24096 +               local_unlock_irqrestore(irq_off_lock, flags);
24097  
24098                 if (tag >= 0 || state == TASK_RUNNING)
24099                         break;
24100 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
24101  
24102                 schedule();
24103  
24104 -               local_irq_save(flags);
24105 +               local_lock_irqsave(irq_off_lock, flags);
24106                 tags = this_cpu_ptr(pool->tag_cpu);
24107         }
24108         if (state != TASK_RUNNING)
24109 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
24110  
24111         BUG_ON(tag >= pool->nr_tags);
24112  
24113 -       local_irq_save(flags);
24114 +       local_lock_irqsave(irq_off_lock, flags);
24115         tags = this_cpu_ptr(pool->tag_cpu);
24116  
24117         spin_lock(&tags->lock);
24118 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
24119                 spin_unlock(&pool->lock);
24120         }
24121  
24122 -       local_irq_restore(flags);
24123 +       local_unlock_irqrestore(irq_off_lock, flags);
24124  }
24125  EXPORT_SYMBOL_GPL(percpu_ida_free);
24126  
24127 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
24128         struct percpu_ida_cpu *remote;
24129         unsigned cpu, i, err = 0;
24130  
24131 -       local_irq_save(flags);
24132 +       local_lock_irqsave(irq_off_lock, flags);
24133         for_each_possible_cpu(cpu) {
24134                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
24135                 spin_lock(&remote->lock);
24136 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
24137         }
24138         spin_unlock(&pool->lock);
24139  out:
24140 -       local_irq_restore(flags);
24141 +       local_unlock_irqrestore(irq_off_lock, flags);
24142         return err;
24143  }
24144  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
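Sketch (illustration only, not part of the patch): percpu_ida replaces its local_irq_save()/local_irq_restore() pairs with local_lock_irqsave()/local_unlock_irqrestore() on a DEFINE_LOCAL_IRQ_LOCK. On !RT this still disables interrupts; on RT it becomes a sleeping per-CPU lock, so the protected section may block without breaking the per-CPU assumptions. Drop-in usage with invented demo_* names:

#include <linux/locallock.h>	/* added earlier in this patch */
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(demo_lock);
static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_account(void)
{
	unsigned long flags;

	/* was: local_irq_save(flags); */
	local_lock_irqsave(demo_lock, flags);
	__this_cpu_inc(demo_counter);
	/* was: local_irq_restore(flags); */
	local_unlock_irqrestore(demo_lock, flags);
}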
24145 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
24146 index 8e6d552c40dd..741da5a77fd5 100644
24147 --- a/lib/radix-tree.c
24148 +++ b/lib/radix-tree.c
24149 @@ -36,7 +36,7 @@
24150  #include <linux/bitops.h>
24151  #include <linux/rcupdate.h>
24152  #include <linux/preempt.h>             /* in_interrupt() */
24153 -
24154 +#include <linux/locallock.h>
24155  
24156  /* Number of nodes in fully populated tree of given height */
24157  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
24158 @@ -68,6 +68,7 @@ struct radix_tree_preload {
24159         struct radix_tree_node *nodes;
24160  };
24161  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
24162 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
24163  
24164  static inline void *node_to_entry(void *ptr)
24165  {
24166 @@ -290,13 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
24167                  * succeed in getting a node here (and never reach
24168                  * kmem_cache_alloc)
24169                  */
24170 -               rtp = this_cpu_ptr(&radix_tree_preloads);
24171 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
24172                 if (rtp->nr) {
24173                         ret = rtp->nodes;
24174                         rtp->nodes = ret->private_data;
24175                         ret->private_data = NULL;
24176                         rtp->nr--;
24177                 }
24178 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
24179                 /*
24180                  * Update the allocation stack trace as this is more useful
24181                  * for debugging.
24182 @@ -357,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
24183          */
24184         gfp_mask &= ~__GFP_ACCOUNT;
24185  
24186 -       preempt_disable();
24187 +       local_lock(radix_tree_preloads_lock);
24188         rtp = this_cpu_ptr(&radix_tree_preloads);
24189         while (rtp->nr < nr) {
24190 -               preempt_enable();
24191 +               local_unlock(radix_tree_preloads_lock);
24192                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
24193                 if (node == NULL)
24194                         goto out;
24195 -               preempt_disable();
24196 +               local_lock(radix_tree_preloads_lock);
24197                 rtp = this_cpu_ptr(&radix_tree_preloads);
24198                 if (rtp->nr < nr) {
24199                         node->private_data = rtp->nodes;
24200 @@ -406,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
24201         if (gfpflags_allow_blocking(gfp_mask))
24202                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
24203         /* Preloading doesn't help anything with this gfp mask, skip it */
24204 -       preempt_disable();
24205 +       local_lock(radix_tree_preloads_lock);
24206         return 0;
24207  }
24208  EXPORT_SYMBOL(radix_tree_maybe_preload);
24209 @@ -422,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
24210  
24211         /* Preloading doesn't help anything with this gfp mask, skip it */
24212         if (!gfpflags_allow_blocking(gfp_mask)) {
24213 -               preempt_disable();
24214 +               local_lock(radix_tree_preloads_lock);
24215                 return 0;
24216         }
24217  
24218 @@ -456,6 +458,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
24219         return __radix_tree_preload(gfp_mask, nr_nodes);
24220  }
24221  
24222 +void radix_tree_preload_end(void)
24223 +{
24224 +       local_unlock(radix_tree_preloads_lock);
24225 +}
24226 +EXPORT_SYMBOL(radix_tree_preload_end);
24227 +
24228  /*
24229   * The maximum index which can be stored in a radix tree
24230   */
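Sketch (illustration only, not part of the patch): the radix-tree preload cache is now guarded by radix_tree_preloads_lock, and radix_tree_node_alloc() fetches the per-CPU pool with get_locked_var(), which takes the local lock and returns this CPU's instance in one step; radix_tree_preload_end() becomes a real exported function so it can drop that lock. The access pattern in isolation, with invented demo_* names:

#include <linux/locallock.h>	/* added earlier in this patch */
#include <linux/percpu.h>

struct demo_preload {
	int nr;
};

static DEFINE_PER_CPU(struct demo_preload, demo_preloads);
static DEFINE_LOCAL_IRQ_LOCK(demo_preloads_lock);

static int demo_take_node(void)
{
	struct demo_preload *p;
	int got = 0;

	/* local_lock() + this_cpu_ptr() in one step */
	p = &get_locked_var(demo_preloads_lock, demo_preloads);
	if (p->nr) {
		p->nr--;
		got = 1;
	}
	put_locked_var(demo_preloads_lock, demo_preloads);
	return got;
}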
24231 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
24232 index 004fc70fc56a..ccc46992a517 100644
24233 --- a/lib/scatterlist.c
24234 +++ b/lib/scatterlist.c
24235 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
24236                         flush_kernel_dcache_page(miter->page);
24237  
24238                 if (miter->__flags & SG_MITER_ATOMIC) {
24239 -                       WARN_ON_ONCE(preemptible());
24240 +                       WARN_ON_ONCE(!pagefault_disabled());
24241                         kunmap_atomic(miter->addr);
24242                 } else
24243                         kunmap(miter->page);
24244 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
24245         if (!sg_miter_skip(&miter, skip))
24246                 return false;
24247  
24248 -       local_irq_save(flags);
24249 +       local_irq_save_nort(flags);
24250  
24251         while (sg_miter_next(&miter) && offset < buflen) {
24252                 unsigned int len;
24253 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
24254  
24255         sg_miter_stop(&miter);
24256  
24257 -       local_irq_restore(flags);
24258 +       local_irq_restore_nort(flags);
24259         return offset;
24260  }
24261  EXPORT_SYMBOL(sg_copy_buffer);
24262 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
24263 index 1afec32de6f2..11fa431046a8 100644
24264 --- a/lib/smp_processor_id.c
24265 +++ b/lib/smp_processor_id.c
24266 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
24267         if (!printk_ratelimit())
24268                 goto out_enable;
24269  
24270 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
24271 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
24272 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
24273 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
24274 +               current->comm, current->pid);
24275  
24276         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
24277         dump_stack();
24278 diff --git a/localversion-rt b/localversion-rt
24279 new file mode 100644
24280 index 000000000000..9e7cd66d9f44
24281 --- /dev/null
24282 +++ b/localversion-rt
24283 @@ -0,0 +1 @@
24284 +-rt18
24285 diff --git a/mm/Kconfig b/mm/Kconfig
24286 index 86e3e0e74d20..77e5862a1ed2 100644
24287 --- a/mm/Kconfig
24288 +++ b/mm/Kconfig
24289 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
24290  
24291  config TRANSPARENT_HUGEPAGE
24292         bool "Transparent Hugepage Support"
24293 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
24294 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
24295         select COMPACTION
24296         select RADIX_TREE_MULTIORDER
24297         help
24298 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
24299 index 6ff2d7744223..b5a91dd53b5f 100644
24300 --- a/mm/backing-dev.c
24301 +++ b/mm/backing-dev.c
24302 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
24303  {
24304         unsigned long flags;
24305  
24306 -       local_irq_save(flags);
24307 +       local_irq_save_nort(flags);
24308         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
24309 -               local_irq_restore(flags);
24310 +               local_irq_restore_nort(flags);
24311                 return;
24312         }
24313  
24314 diff --git a/mm/compaction.c b/mm/compaction.c
24315 index 70e6bec46dc2..6678ed58b7c6 100644
24316 --- a/mm/compaction.c
24317 +++ b/mm/compaction.c
24318 @@ -1593,10 +1593,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
24319                                 block_start_pfn(cc->migrate_pfn, cc->order);
24320  
24321                         if (cc->last_migrated_pfn < current_block_start) {
24322 -                               cpu = get_cpu();
24323 +                               cpu = get_cpu_light();
24324 +                               local_lock_irq(swapvec_lock);
24325                                 lru_add_drain_cpu(cpu);
24326 +                               local_unlock_irq(swapvec_lock);
24327                                 drain_local_pages(zone);
24328 -                               put_cpu();
24329 +                               put_cpu_light();
24330                                 /* No more flushing until we migrate again */
24331                                 cc->last_migrated_pfn = 0;
24332                         }
24333 diff --git a/mm/filemap.c b/mm/filemap.c
24334 index edfb90e3830c..a8d2c7a73d54 100644
24335 --- a/mm/filemap.c
24336 +++ b/mm/filemap.c
24337 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
24338                  * node->private_list is protected by
24339                  * mapping->tree_lock.
24340                  */
24341 -               if (!list_empty(&node->private_list))
24342 -                       list_lru_del(&workingset_shadow_nodes,
24343 +               if (!list_empty(&node->private_list)) {
24344 +                       local_lock(workingset_shadow_lock);
24345 +                       list_lru_del(&__workingset_shadow_nodes,
24346                                      &node->private_list);
24347 +                       local_unlock(workingset_shadow_lock);
24348 +               }
24349         }
24350         return 0;
24351  }
24352 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
24353                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
24354                                 list_empty(&node->private_list)) {
24355                         node->private_data = mapping;
24356 -                       list_lru_add(&workingset_shadow_nodes,
24357 -                                       &node->private_list);
24358 +                       local_lock(workingset_shadow_lock);
24359 +                       list_lru_add(&__workingset_shadow_nodes,
24360 +                                    &node->private_list);
24361 +                       local_unlock(workingset_shadow_lock);
24362                 }
24363         }
24364  
24365 diff --git a/mm/highmem.c b/mm/highmem.c
24366 index 50b4ca6787f0..77518a3b35a1 100644
24367 --- a/mm/highmem.c
24368 +++ b/mm/highmem.c
24369 @@ -29,10 +29,11 @@
24370  #include <linux/kgdb.h>
24371  #include <asm/tlbflush.h>
24372  
24373 -
24374 +#ifndef CONFIG_PREEMPT_RT_FULL
24375  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
24376  DEFINE_PER_CPU(int, __kmap_atomic_idx);
24377  #endif
24378 +#endif
24379  
24380  /*
24381   * Virtual_count is not a pure "count".
24382 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
24383  unsigned long totalhigh_pages __read_mostly;
24384  EXPORT_SYMBOL(totalhigh_pages);
24385  
24386 -
24387 +#ifndef CONFIG_PREEMPT_RT_FULL
24388  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
24389 +#endif
24390  
24391  unsigned int nr_free_highpages (void)
24392  {
24393 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
24394 index 47559cc0cdcc..1f2ebc924916 100644
24395 --- a/mm/memcontrol.c
24396 +++ b/mm/memcontrol.c
24397 @@ -67,6 +67,7 @@
24398  #include <net/sock.h>
24399  #include <net/ip.h>
24400  #include "slab.h"
24401 +#include <linux/locallock.h>
24402  
24403  #include <asm/uaccess.h>
24404  
24405 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
24406  #define do_swap_account                0
24407  #endif
24408  
24409 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
24410 +
24411  /* Whether legacy memory+swap accounting is active */
24412  static bool do_memsw_account(void)
24413  {
24414 @@ -1692,6 +1695,7 @@ struct memcg_stock_pcp {
24415  #define FLUSHING_CACHED_CHARGE 0
24416  };
24417  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
24418 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
24419  static DEFINE_MUTEX(percpu_charge_mutex);
24420  
24421  /**
24422 @@ -1714,7 +1718,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
24423         if (nr_pages > CHARGE_BATCH)
24424                 return ret;
24425  
24426 -       local_irq_save(flags);
24427 +       local_lock_irqsave(memcg_stock_ll, flags);
24428  
24429         stock = this_cpu_ptr(&memcg_stock);
24430         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
24431 @@ -1722,7 +1726,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
24432                 ret = true;
24433         }
24434  
24435 -       local_irq_restore(flags);
24436 +       local_unlock_irqrestore(memcg_stock_ll, flags);
24437  
24438         return ret;
24439  }
24440 @@ -1749,13 +1753,13 @@ static void drain_local_stock(struct work_struct *dummy)
24441         struct memcg_stock_pcp *stock;
24442         unsigned long flags;
24443  
24444 -       local_irq_save(flags);
24445 +       local_lock_irqsave(memcg_stock_ll, flags);
24446  
24447         stock = this_cpu_ptr(&memcg_stock);
24448         drain_stock(stock);
24449         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
24450  
24451 -       local_irq_restore(flags);
24452 +       local_unlock_irqrestore(memcg_stock_ll, flags);
24453  }
24454  
24455  /*
24456 @@ -1767,7 +1771,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
24457         struct memcg_stock_pcp *stock;
24458         unsigned long flags;
24459  
24460 -       local_irq_save(flags);
24461 +       local_lock_irqsave(memcg_stock_ll, flags);
24462  
24463         stock = this_cpu_ptr(&memcg_stock);
24464         if (stock->cached != memcg) { /* reset if necessary */
24465 @@ -1776,7 +1780,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
24466         }
24467         stock->nr_pages += nr_pages;
24468  
24469 -       local_irq_restore(flags);
24470 +       local_unlock_irqrestore(memcg_stock_ll, flags);
24471  }
24472  
24473  /*
24474 @@ -1792,7 +1796,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
24475                 return;
24476         /* Notify other cpus that system-wide "drain" is running */
24477         get_online_cpus();
24478 -       curcpu = get_cpu();
24479 +       curcpu = get_cpu_light();
24480         for_each_online_cpu(cpu) {
24481                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
24482                 struct mem_cgroup *memcg;
24483 @@ -1809,7 +1813,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
24484                                 schedule_work_on(cpu, &stock->work);
24485                 }
24486         }
24487 -       put_cpu();
24488 +       put_cpu_light();
24489         put_online_cpus();
24490         mutex_unlock(&percpu_charge_mutex);
24491  }
24492 @@ -4555,12 +4559,12 @@ static int mem_cgroup_move_account(struct page *page,
24493  
24494         ret = 0;
24495  
24496 -       local_irq_disable();
24497 +       local_lock_irq(event_lock);
24498         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
24499         memcg_check_events(to, page);
24500         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
24501         memcg_check_events(from, page);
24502 -       local_irq_enable();
24503 +       local_unlock_irq(event_lock);
24504  out_unlock:
24505         unlock_page(page);
24506  out:
24507 @@ -5435,10 +5439,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
24508  
24509         commit_charge(page, memcg, lrucare);
24510  
24511 -       local_irq_disable();
24512 +       local_lock_irq(event_lock);
24513         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
24514         memcg_check_events(memcg, page);
24515 -       local_irq_enable();
24516 +       local_unlock_irq(event_lock);
24517  
24518         if (do_memsw_account() && PageSwapCache(page)) {
24519                 swp_entry_t entry = { .val = page_private(page) };
24520 @@ -5494,14 +5498,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
24521                 memcg_oom_recover(memcg);
24522         }
24523  
24524 -       local_irq_save(flags);
24525 +       local_lock_irqsave(event_lock, flags);
24526         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
24527         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
24528         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
24529         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
24530         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
24531         memcg_check_events(memcg, dummy_page);
24532 -       local_irq_restore(flags);
24533 +       local_unlock_irqrestore(event_lock, flags);
24534  
24535         if (!mem_cgroup_is_root(memcg))
24536                 css_put_many(&memcg->css, nr_pages);
24537 @@ -5656,10 +5660,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
24538  
24539         commit_charge(newpage, memcg, false);
24540  
24541 -       local_irq_save(flags);
24542 +       local_lock_irqsave(event_lock, flags);
24543         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
24544         memcg_check_events(memcg, newpage);
24545 -       local_irq_restore(flags);
24546 +       local_unlock_irqrestore(event_lock, flags);
24547  }
24548  
24549  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
24550 @@ -5850,6 +5854,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
24551  {
24552         struct mem_cgroup *memcg, *swap_memcg;
24553         unsigned short oldid;
24554 +       unsigned long flags;
24555  
24556         VM_BUG_ON_PAGE(PageLRU(page), page);
24557         VM_BUG_ON_PAGE(page_count(page), page);
24558 @@ -5890,12 +5895,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
24559          * important here to have the interrupts disabled because it is the
24560          * only synchronisation we have for udpating the per-CPU variables.
24561          */
24562 +       local_lock_irqsave(event_lock, flags);
24563 +#ifndef CONFIG_PREEMPT_RT_BASE
24564         VM_BUG_ON(!irqs_disabled());
24565 +#endif
24566         mem_cgroup_charge_statistics(memcg, page, false, -1);
24567         memcg_check_events(memcg, page);
24568  
24569         if (!mem_cgroup_is_root(memcg))
24570                 css_put(&memcg->css);
24571 +       local_unlock_irqrestore(event_lock, flags);
24572  }
24573  
24574  /*
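Sketch (illustration only, not part of the patch): memcontrol.c moves the per-CPU charge stock and the event statistics under local locks (memcg_stock_ll, event_lock), and drain_all_stock() switches from get_cpu()/put_cpu() to the get_cpu_light()/put_cpu_light() helpers introduced elsewhere in this patch, which pin the task to its CPU without keeping preemption disabled across the whole loop. The loop shape, with an invented demo_drain_all():

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static void demo_drain_all(void)
{
	int cpu, curcpu;

	get_online_cpus();
	/* was: curcpu = get_cpu();  (preemption off for the whole loop) */
	curcpu = get_cpu_light();
	for_each_online_cpu(cpu) {
		if (cpu == curcpu)
			continue;
		/* queue per-CPU drain work here; may take sleeping locks on RT */
	}
	put_cpu_light();
	put_online_cpus();
}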
24575 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
24576 index 6f4d27c5bb32..5cd25c745a8f 100644
24577 --- a/mm/mmu_context.c
24578 +++ b/mm/mmu_context.c
24579 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
24580         struct task_struct *tsk = current;
24581  
24582         task_lock(tsk);
24583 +       preempt_disable_rt();
24584         active_mm = tsk->active_mm;
24585         if (active_mm != mm) {
24586                 atomic_inc(&mm->mm_count);
24587 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
24588         }
24589         tsk->mm = mm;
24590         switch_mm(active_mm, mm, tsk);
24591 +       preempt_enable_rt();
24592         task_unlock(tsk);
24593  #ifdef finish_arch_post_lock_switch
24594         finish_arch_post_lock_switch();
24595 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
24596 index e5b159b88e39..b9946dcb1099 100644
24597 --- a/mm/page_alloc.c
24598 +++ b/mm/page_alloc.c
24599 @@ -61,6 +61,7 @@
24600  #include <linux/page_ext.h>
24601  #include <linux/hugetlb.h>
24602  #include <linux/sched/rt.h>
24603 +#include <linux/locallock.h>
24604  #include <linux/page_owner.h>
24605  #include <linux/kthread.h>
24606  #include <linux/memcontrol.h>
24607 @@ -281,6 +282,18 @@ EXPORT_SYMBOL(nr_node_ids);
24608  EXPORT_SYMBOL(nr_online_nodes);
24609  #endif
24610  
24611 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
24612 +
24613 +#ifdef CONFIG_PREEMPT_RT_BASE
24614 +# define cpu_lock_irqsave(cpu, flags)          \
24615 +       local_lock_irqsave_on(pa_lock, flags, cpu)
24616 +# define cpu_unlock_irqrestore(cpu, flags)     \
24617 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
24618 +#else
24619 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
24620 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
24621 +#endif
24622 +
24623  int page_group_by_mobility_disabled __read_mostly;
24624  
24625  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
24626 @@ -1072,7 +1085,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
24627  #endif /* CONFIG_DEBUG_VM */
24628  
24629  /*
24630 - * Frees a number of pages from the PCP lists
24631 + * Frees a number of pages which have been collected from the pcp lists.
24632   * Assumes all pages on list are in same zone, and of same order.
24633   * count is the number of pages to free.
24634   *
24635 @@ -1083,19 +1096,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
24636   * pinned" detection logic.
24637   */
24638  static void free_pcppages_bulk(struct zone *zone, int count,
24639 -                                       struct per_cpu_pages *pcp)
24640 +                              struct list_head *list)
24641  {
24642 -       int migratetype = 0;
24643 -       int batch_free = 0;
24644         unsigned long nr_scanned;
24645         bool isolated_pageblocks;
24646 +       unsigned long flags;
24647 +
24648 +       spin_lock_irqsave(&zone->lock, flags);
24649  
24650 -       spin_lock(&zone->lock);
24651         isolated_pageblocks = has_isolate_pageblock(zone);
24652         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
24653         if (nr_scanned)
24654                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
24655  
24656 +       while (!list_empty(list)) {
24657 +               struct page *page;
24658 +               int mt; /* migratetype of the to-be-freed page */
24659 +
24660 +               page = list_first_entry(list, struct page, lru);
24661 +               /* must delete as __free_one_page list manipulates */
24662 +               list_del(&page->lru);
24663 +
24664 +               mt = get_pcppage_migratetype(page);
24665 +               /* MIGRATE_ISOLATE page should not go to pcplists */
24666 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
24667 +               /* Pageblock could have been isolated meanwhile */
24668 +               if (unlikely(isolated_pageblocks))
24669 +                       mt = get_pageblock_migratetype(page);
24670 +
24671 +               if (bulkfree_pcp_prepare(page))
24672 +                       continue;
24673 +
24674 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
24675 +               trace_mm_page_pcpu_drain(page, 0, mt);
24676 +               count--;
24677 +       }
24678 +       WARN_ON(count != 0);
24679 +       spin_unlock_irqrestore(&zone->lock, flags);
24680 +}
24681 +
24682 +/*
24683 + * Moves a number of pages from the PCP lists to free list which
24684 + * is freed outside of the locked region.
24685 + *
24686 + * Assumes all pages on list are in same zone, and of same order.
24687 + * count is the number of pages to free.
24688 + */
24689 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
24690 +                             struct list_head *dst)
24691 +{
24692 +       int migratetype = 0;
24693 +       int batch_free = 0;
24694 +
24695         while (count) {
24696                 struct page *page;
24697                 struct list_head *list;
24698 @@ -1111,7 +1163,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
24699                         batch_free++;
24700                         if (++migratetype == MIGRATE_PCPTYPES)
24701                                 migratetype = 0;
24702 -                       list = &pcp->lists[migratetype];
24703 +                       list = &src->lists[migratetype];
24704                 } while (list_empty(list));
24705  
24706                 /* This is the only non-empty list. Free them all. */
24707 @@ -1119,27 +1171,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
24708                         batch_free = count;
24709  
24710                 do {
24711 -                       int mt; /* migratetype of the to-be-freed page */
24712 -
24713                         page = list_last_entry(list, struct page, lru);
24714 -                       /* must delete as __free_one_page list manipulates */
24715                         list_del(&page->lru);
24716  
24717 -                       mt = get_pcppage_migratetype(page);
24718 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
24719 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
24720 -                       /* Pageblock could have been isolated meanwhile */
24721 -                       if (unlikely(isolated_pageblocks))
24722 -                               mt = get_pageblock_migratetype(page);
24723 -
24724 -                       if (bulkfree_pcp_prepare(page))
24725 -                               continue;
24726 -
24727 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
24728 -                       trace_mm_page_pcpu_drain(page, 0, mt);
24729 +                       list_add(&page->lru, dst);
24730                 } while (--count && --batch_free && !list_empty(list));
24731         }
24732 -       spin_unlock(&zone->lock);
24733  }
24734  
24735  static void free_one_page(struct zone *zone,
24736 @@ -1148,7 +1185,9 @@ static void free_one_page(struct zone *zone,
24737                                 int migratetype)
24738  {
24739         unsigned long nr_scanned;
24740 -       spin_lock(&zone->lock);
24741 +       unsigned long flags;
24742 +
24743 +       spin_lock_irqsave(&zone->lock, flags);
24744         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
24745         if (nr_scanned)
24746                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
24747 @@ -1158,7 +1197,7 @@ static void free_one_page(struct zone *zone,
24748                 migratetype = get_pfnblock_migratetype(page, pfn);
24749         }
24750         __free_one_page(page, pfn, zone, order, migratetype);
24751 -       spin_unlock(&zone->lock);
24752 +       spin_unlock_irqrestore(&zone->lock, flags);
24753  }
24754  
24755  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
24756 @@ -1244,10 +1283,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
24757                 return;
24758  
24759         migratetype = get_pfnblock_migratetype(page, pfn);
24760 -       local_irq_save(flags);
24761 +       local_lock_irqsave(pa_lock, flags);
24762         __count_vm_events(PGFREE, 1 << order);
24763         free_one_page(page_zone(page), page, pfn, order, migratetype);
24764 -       local_irq_restore(flags);
24765 +       local_unlock_irqrestore(pa_lock, flags);
24766  }
24767  
24768  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
24769 @@ -2246,16 +2285,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
24770  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
24771  {
24772         unsigned long flags;
24773 +       LIST_HEAD(dst);
24774         int to_drain, batch;
24775  
24776 -       local_irq_save(flags);
24777 +       local_lock_irqsave(pa_lock, flags);
24778         batch = READ_ONCE(pcp->batch);
24779         to_drain = min(pcp->count, batch);
24780         if (to_drain > 0) {
24781 -               free_pcppages_bulk(zone, to_drain, pcp);
24782 +               isolate_pcp_pages(to_drain, pcp, &dst);
24783                 pcp->count -= to_drain;
24784         }
24785 -       local_irq_restore(flags);
24786 +       local_unlock_irqrestore(pa_lock, flags);
24787 +       free_pcppages_bulk(zone, to_drain, &dst);
24788  }
24789  #endif
24790  
24791 @@ -2271,16 +2312,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
24792         unsigned long flags;
24793         struct per_cpu_pageset *pset;
24794         struct per_cpu_pages *pcp;
24795 +       LIST_HEAD(dst);
24796 +       int count;
24797  
24798 -       local_irq_save(flags);
24799 +       cpu_lock_irqsave(cpu, flags);
24800         pset = per_cpu_ptr(zone->pageset, cpu);
24801  
24802         pcp = &pset->pcp;
24803 -       if (pcp->count) {
24804 -               free_pcppages_bulk(zone, pcp->count, pcp);
24805 +       count = pcp->count;
24806 +       if (count) {
24807 +               isolate_pcp_pages(count, pcp, &dst);
24808                 pcp->count = 0;
24809         }
24810 -       local_irq_restore(flags);
24811 +       cpu_unlock_irqrestore(cpu, flags);
24812 +       if (count)
24813 +               free_pcppages_bulk(zone, count, &dst);
24814  }
24815  
24816  /*
24817 @@ -2366,8 +2412,17 @@ void drain_all_pages(struct zone *zone)
24818                 else
24819                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
24820         }
24821 +#ifndef CONFIG_PREEMPT_RT_BASE
24822         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
24823                                                                 zone, 1);
24824 +#else
24825 +       for_each_cpu(cpu, &cpus_with_pcps) {
24826 +               if (zone)
24827 +                       drain_pages_zone(cpu, zone);
24828 +               else
24829 +                       drain_pages(cpu);
24830 +       }
24831 +#endif
24832  }
24833  
24834  #ifdef CONFIG_HIBERNATION
24835 @@ -2427,7 +2482,7 @@ void free_hot_cold_page(struct page *page, bool cold)
24836  
24837         migratetype = get_pfnblock_migratetype(page, pfn);
24838         set_pcppage_migratetype(page, migratetype);
24839 -       local_irq_save(flags);
24840 +       local_lock_irqsave(pa_lock, flags);
24841         __count_vm_event(PGFREE);
24842  
24843         /*
24844 @@ -2453,12 +2508,17 @@ void free_hot_cold_page(struct page *page, bool cold)
24845         pcp->count++;
24846         if (pcp->count >= pcp->high) {
24847                 unsigned long batch = READ_ONCE(pcp->batch);
24848 -               free_pcppages_bulk(zone, batch, pcp);
24849 +               LIST_HEAD(dst);
24850 +
24851 +               isolate_pcp_pages(batch, pcp, &dst);
24852                 pcp->count -= batch;
24853 +               local_unlock_irqrestore(pa_lock, flags);
24854 +               free_pcppages_bulk(zone, batch, &dst);
24855 +               return;
24856         }
24857  
24858  out:
24859 -       local_irq_restore(flags);
24860 +       local_unlock_irqrestore(pa_lock, flags);
24861  }
24862  
24863  /*
24864 @@ -2600,7 +2660,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
24865                 struct per_cpu_pages *pcp;
24866                 struct list_head *list;
24867  
24868 -               local_irq_save(flags);
24869 +               local_lock_irqsave(pa_lock, flags);
24870                 do {
24871                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
24872                         list = &pcp->lists[migratetype];
24873 @@ -2627,7 +2687,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
24874                  * allocate greater than order-1 page units with __GFP_NOFAIL.
24875                  */
24876                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
24877 -               spin_lock_irqsave(&zone->lock, flags);
24878 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
24879  
24880                 do {
24881                         page = NULL;
24882 @@ -2639,22 +2699,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
24883                         if (!page)
24884                                 page = __rmqueue(zone, order, migratetype);
24885                 } while (page && check_new_pages(page, order));
24886 -               spin_unlock(&zone->lock);
24887 -               if (!page)
24888 +               if (!page) {
24889 +                       spin_unlock(&zone->lock);
24890                         goto failed;
24891 +               }
24892                 __mod_zone_freepage_state(zone, -(1 << order),
24893                                           get_pcppage_migratetype(page));
24894 +               spin_unlock(&zone->lock);
24895         }
24896  
24897         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
24898         zone_statistics(preferred_zone, zone, gfp_flags);
24899 -       local_irq_restore(flags);
24900 +       local_unlock_irqrestore(pa_lock, flags);
24901  
24902         VM_BUG_ON_PAGE(bad_range(zone, page), page);
24903         return page;
24904  
24905  failed:
24906 -       local_irq_restore(flags);
24907 +       local_unlock_irqrestore(pa_lock, flags);
24908         return NULL;
24909  }
24910  
24911 @@ -6531,7 +6593,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
24912         int cpu = (unsigned long)hcpu;
24913  
24914         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
24915 +               local_lock_irq_on(swapvec_lock, cpu);
24916                 lru_add_drain_cpu(cpu);
24917 +               local_unlock_irq_on(swapvec_lock, cpu);
24918                 drain_pages(cpu);
24919  
24920                 /*
24921 @@ -6557,6 +6621,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
24922  void __init page_alloc_init(void)
24923  {
24924         hotcpu_notifier(page_alloc_cpu_notify, 0);
24925 +       local_irq_lock_init(pa_lock);
24926  }
24927  
24928  /*
24929 @@ -7385,7 +7450,7 @@ void zone_pcp_reset(struct zone *zone)
24930         struct per_cpu_pageset *pset;
24931  
24932         /* avoid races with drain_pages()  */
24933 -       local_irq_save(flags);
24934 +       local_lock_irqsave(pa_lock, flags);
24935         if (zone->pageset != &boot_pageset) {
24936                 for_each_online_cpu(cpu) {
24937                         pset = per_cpu_ptr(zone->pageset, cpu);
24938 @@ -7394,7 +7459,7 @@ void zone_pcp_reset(struct zone *zone)
24939                 free_percpu(zone->pageset);
24940                 zone->pageset = &boot_pageset;
24941         }
24942 -       local_irq_restore(flags);
24943 +       local_unlock_irqrestore(pa_lock, flags);
24944  }
24945  
24946  #ifdef CONFIG_MEMORY_HOTREMOVE
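Sketch (illustration only, not part of the patch): the page allocator change splits pcp freeing into two phases. isolate_pcp_pages() only moves pages from the per-CPU lists onto a private list while pa_lock is held, and free_pcppages_bulk() then takes zone->lock with spin_lock_irqsave() and does the real freeing after the per-CPU section has been left, so the locally locked (and, on !RT, IRQ-off) window stays short. The shape of that two-phase pattern with invented demo_* names, not the mm/ code itself:

#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_item {
	struct list_head lru;
};

static LIST_HEAD(demo_pcp_list);	/* stand-in for a per-CPU list */
static DEFINE_SPINLOCK(demo_zone_lock);

/* phase 1: runs under the per-CPU (local) lock, only moves list entries */
static void demo_isolate(struct list_head *dst)
{
	list_splice_init(&demo_pcp_list, dst);
}

/* phase 2: runs after the per-CPU lock is dropped, takes the shared lock */
static void demo_free_bulk(struct list_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_zone_lock, flags);
	while (!list_empty(list)) {
		struct demo_item *it = list_first_entry(list, struct demo_item, lru);

		list_del(&it->lru);
		/* hand the item back to the real free path here */
	}
	spin_unlock_irqrestore(&demo_zone_lock, flags);
}

This is also why free_hot_cold_page() above drops pa_lock before calling free_pcppages_bulk().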
24947 diff --git a/mm/percpu.c b/mm/percpu.c
24948 index f014cebbf405..4e739fcf91bf 100644
24949 --- a/mm/percpu.c
24950 +++ b/mm/percpu.c
24951 @@ -1283,6 +1283,31 @@ void free_percpu(void __percpu *ptr)
24952  }
24953  EXPORT_SYMBOL_GPL(free_percpu);
24954  
24955 +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
24956 +{
24957 +#ifdef CONFIG_SMP
24958 +       const size_t static_size = __per_cpu_end - __per_cpu_start;
24959 +       void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
24960 +       unsigned int cpu;
24961 +
24962 +       for_each_possible_cpu(cpu) {
24963 +               void *start = per_cpu_ptr(base, cpu);
24964 +               void *va = (void *)addr;
24965 +
24966 +               if (va >= start && va < start + static_size) {
24967 +                       if (can_addr) {
24968 +                               *can_addr = (unsigned long) (va - start);
24969 +                               *can_addr += (unsigned long)
24970 +                                       per_cpu_ptr(base, get_boot_cpu_id());
24971 +                       }
24972 +                       return true;
24973 +               }
24974 +       }
24975 +#endif
24976 +       /* on UP, can't distinguish from other static vars, always false */
24977 +       return false;
24978 +}
24979 +
24980  /**
24981   * is_kernel_percpu_address - test whether address is from static percpu area
24982   * @addr: address to test
24983 @@ -1296,20 +1321,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
24984   */
24985  bool is_kernel_percpu_address(unsigned long addr)
24986  {
24987 -#ifdef CONFIG_SMP
24988 -       const size_t static_size = __per_cpu_end - __per_cpu_start;
24989 -       void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
24990 -       unsigned int cpu;
24991 -
24992 -       for_each_possible_cpu(cpu) {
24993 -               void *start = per_cpu_ptr(base, cpu);
24994 -
24995 -               if ((void *)addr >= start && (void *)addr < start + static_size)
24996 -                       return true;
24997 -        }
24998 -#endif
24999 -       /* on UP, can't distinguish from other static vars, always false */
25000 -       return false;
25001 +       return __is_kernel_percpu_address(addr, NULL);
25002  }
25003  
25004  /**
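Sketch (illustration only, not part of the patch): mm/percpu.c factors the static per-CPU address check into __is_kernel_percpu_address(), which can additionally return the address translated to the boot CPU's copy of the same variable, so a caller can treat every CPU's instance as one static object. A hypothetical caller, only to show the calling convention (the prototype is assumed to be the one this patch adds to the percpu headers):

#include <linux/percpu.h>
#include <linux/printk.h>

static int demo_report_percpu(unsigned long addr)
{
	unsigned long can_addr;

	if (!__is_kernel_percpu_address(addr, &can_addr))
		return 0;

	/* can_addr points at the boot CPU's copy of the same per-CPU variable */
	pr_info("0x%lx is static percpu, canonical copy at 0x%lx\n", addr, can_addr);
	return 1;
}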
25005 diff --git a/mm/slab.h b/mm/slab.h
25006 index ceb7d70cdb76..dfd281e43fbe 100644
25007 --- a/mm/slab.h
25008 +++ b/mm/slab.h
25009 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
25010   * The slab lists for all objects.
25011   */
25012  struct kmem_cache_node {
25013 +#ifdef CONFIG_SLUB
25014 +       raw_spinlock_t list_lock;
25015 +#else
25016         spinlock_t list_lock;
25017 +#endif
25018  
25019  #ifdef CONFIG_SLAB
25020         struct list_head slabs_partial; /* partial list first, better asm code */
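Sketch (illustration only, not part of the patch): on PREEMPT_RT_FULL a spinlock_t is a sleeping, rtmutex based lock, while raw_spinlock_t stays a real spinning, interrupt-disabling lock. SLUB's node list_lock is taken with interrupts disabled and only protects short list operations, so the slub.c hunks below keep it a raw_spinlock_t and switch every lock/unlock of it to the raw_ variant. The minimal pattern:

#include <linux/list.h>
#include <linux/spinlock.h>

struct demo_node {
	raw_spinlock_t list_lock;	/* was spinlock_t, i.e. a sleeping lock on RT */
	struct list_head partial;
};

static void demo_node_op(struct demo_node *n)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&n->list_lock, flags);
	/* short, non-sleeping list manipulation only */
	raw_spin_unlock_irqrestore(&n->list_lock, flags);
}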
25021 diff --git a/mm/slub.c b/mm/slub.c
25022 index 58c7526f8de2..6d72b7f87129 100644
25023 --- a/mm/slub.c
25024 +++ b/mm/slub.c
25025 @@ -1141,7 +1141,7 @@ static noinline int free_debug_processing(
25026         unsigned long uninitialized_var(flags);
25027         int ret = 0;
25028  
25029 -       spin_lock_irqsave(&n->list_lock, flags);
25030 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25031         slab_lock(page);
25032  
25033         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
25034 @@ -1176,7 +1176,7 @@ static noinline int free_debug_processing(
25035                          bulk_cnt, cnt);
25036  
25037         slab_unlock(page);
25038 -       spin_unlock_irqrestore(&n->list_lock, flags);
25039 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25040         if (!ret)
25041                 slab_fix(s, "Object at 0x%p not freed", object);
25042         return ret;
25043 @@ -1304,6 +1304,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
25044  
25045  #endif /* CONFIG_SLUB_DEBUG */
25046  
25047 +struct slub_free_list {
25048 +       raw_spinlock_t          lock;
25049 +       struct list_head        list;
25050 +};
25051 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
25052 +
25053  /*
25054   * Hooks for other subsystems that check memory allocations. In a typical
25055   * production configuration these hooks all should produce no code at all.
25056 @@ -1527,10 +1533,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
25057         void *start, *p;
25058         int idx, order;
25059         bool shuffle;
25060 +       bool enableirqs = false;
25061  
25062         flags &= gfp_allowed_mask;
25063  
25064         if (gfpflags_allow_blocking(flags))
25065 +               enableirqs = true;
25066 +#ifdef CONFIG_PREEMPT_RT_FULL
25067 +       if (system_state == SYSTEM_RUNNING)
25068 +               enableirqs = true;
25069 +#endif
25070 +       if (enableirqs)
25071                 local_irq_enable();
25072  
25073         flags |= s->allocflags;
25074 @@ -1605,7 +1618,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
25075         page->frozen = 1;
25076  
25077  out:
25078 -       if (gfpflags_allow_blocking(flags))
25079 +       if (enableirqs)
25080                 local_irq_disable();
25081         if (!page)
25082                 return NULL;
25083 @@ -1664,6 +1677,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
25084         __free_pages(page, order);
25085  }
25086  
25087 +static void free_delayed(struct list_head *h)
25088 +{
25089 +       while (!list_empty(h)) {
25090 +               struct page *page = list_first_entry(h, struct page, lru);
25091 +
25092 +               list_del(&page->lru);
25093 +               __free_slab(page->slab_cache, page);
25094 +       }
25095 +}
25096 +
25097  #define need_reserve_slab_rcu                                          \
25098         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
25099  
25100 @@ -1695,6 +1718,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
25101                 }
25102  
25103                 call_rcu(head, rcu_free_slab);
25104 +       } else if (irqs_disabled()) {
25105 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
25106 +
25107 +               raw_spin_lock(&f->lock);
25108 +               list_add(&page->lru, &f->list);
25109 +               raw_spin_unlock(&f->lock);
25110         } else
25111                 __free_slab(s, page);
25112  }
25113 @@ -1802,7 +1831,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
25114         if (!n || !n->nr_partial)
25115                 return NULL;
25116  
25117 -       spin_lock(&n->list_lock);
25118 +       raw_spin_lock(&n->list_lock);
25119         list_for_each_entry_safe(page, page2, &n->partial, lru) {
25120                 void *t;
25121  
25122 @@ -1827,7 +1856,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
25123                         break;
25124  
25125         }
25126 -       spin_unlock(&n->list_lock);
25127 +       raw_spin_unlock(&n->list_lock);
25128         return object;
25129  }
25130  
25131 @@ -2073,7 +2102,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25132                          * that acquire_slab() will see a slab page that
25133                          * is frozen
25134                          */
25135 -                       spin_lock(&n->list_lock);
25136 +                       raw_spin_lock(&n->list_lock);
25137                 }
25138         } else {
25139                 m = M_FULL;
25140 @@ -2084,7 +2113,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25141                          * slabs from diagnostic functions will not see
25142                          * any frozen slabs.
25143                          */
25144 -                       spin_lock(&n->list_lock);
25145 +                       raw_spin_lock(&n->list_lock);
25146                 }
25147         }
25148  
25149 @@ -2119,7 +2148,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
25150                 goto redo;
25151  
25152         if (lock)
25153 -               spin_unlock(&n->list_lock);
25154 +               raw_spin_unlock(&n->list_lock);
25155  
25156         if (m == M_FREE) {
25157                 stat(s, DEACTIVATE_EMPTY);
25158 @@ -2151,10 +2180,10 @@ static void unfreeze_partials(struct kmem_cache *s,
25159                 n2 = get_node(s, page_to_nid(page));
25160                 if (n != n2) {
25161                         if (n)
25162 -                               spin_unlock(&n->list_lock);
25163 +                               raw_spin_unlock(&n->list_lock);
25164  
25165                         n = n2;
25166 -                       spin_lock(&n->list_lock);
25167 +                       raw_spin_lock(&n->list_lock);
25168                 }
25169  
25170                 do {
25171 @@ -2183,7 +2212,7 @@ static void unfreeze_partials(struct kmem_cache *s,
25172         }
25173  
25174         if (n)
25175 -               spin_unlock(&n->list_lock);
25176 +               raw_spin_unlock(&n->list_lock);
25177  
25178         while (discard_page) {
25179                 page = discard_page;
25180 @@ -2222,14 +2251,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
25181                         pobjects = oldpage->pobjects;
25182                         pages = oldpage->pages;
25183                         if (drain && pobjects > s->cpu_partial) {
25184 +                               struct slub_free_list *f;
25185                                 unsigned long flags;
25186 +                               LIST_HEAD(tofree);
25187                                 /*
25188                                  * partial array is full. Move the existing
25189                                  * set to the per node partial list.
25190                                  */
25191                                 local_irq_save(flags);
25192                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
25193 +                               f = this_cpu_ptr(&slub_free_list);
25194 +                               raw_spin_lock(&f->lock);
25195 +                               list_splice_init(&f->list, &tofree);
25196 +                               raw_spin_unlock(&f->lock);
25197                                 local_irq_restore(flags);
25198 +                               free_delayed(&tofree);
25199                                 oldpage = NULL;
25200                                 pobjects = 0;
25201                                 pages = 0;
25202 @@ -2301,7 +2337,22 @@ static bool has_cpu_slab(int cpu, void *info)
25203  
25204  static void flush_all(struct kmem_cache *s)
25205  {
25206 +       LIST_HEAD(tofree);
25207 +       int cpu;
25208 +
25209         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
25210 +       for_each_online_cpu(cpu) {
25211 +               struct slub_free_list *f;
25212 +
25213 +               if (!has_cpu_slab(cpu, s))
25214 +                       continue;
25215 +
25216 +               f = &per_cpu(slub_free_list, cpu);
25217 +               raw_spin_lock_irq(&f->lock);
25218 +               list_splice_init(&f->list, &tofree);
25219 +               raw_spin_unlock_irq(&f->lock);
25220 +               free_delayed(&tofree);
25221 +       }
25222  }
25223  
25224  /*
25225 @@ -2356,10 +2407,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
25226         unsigned long x = 0;
25227         struct page *page;
25228  
25229 -       spin_lock_irqsave(&n->list_lock, flags);
25230 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25231         list_for_each_entry(page, &n->partial, lru)
25232                 x += get_count(page);
25233 -       spin_unlock_irqrestore(&n->list_lock, flags);
25234 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25235         return x;
25236  }
25237  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
25238 @@ -2497,8 +2548,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
25239   * already disabled (which is the case for bulk allocation).
25240   */
25241  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25242 -                         unsigned long addr, struct kmem_cache_cpu *c)
25243 +                         unsigned long addr, struct kmem_cache_cpu *c,
25244 +                         struct list_head *to_free)
25245  {
25246 +       struct slub_free_list *f;
25247         void *freelist;
25248         struct page *page;
25249  
25250 @@ -2558,6 +2611,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25251         VM_BUG_ON(!c->page->frozen);
25252         c->freelist = get_freepointer(s, freelist);
25253         c->tid = next_tid(c->tid);
25254 +
25255 +out:
25256 +       f = this_cpu_ptr(&slub_free_list);
25257 +       raw_spin_lock(&f->lock);
25258 +       list_splice_init(&f->list, to_free);
25259 +       raw_spin_unlock(&f->lock);
25260 +
25261         return freelist;
25262  
25263  new_slab:
25264 @@ -2589,7 +2649,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25265         deactivate_slab(s, page, get_freepointer(s, freelist));
25266         c->page = NULL;
25267         c->freelist = NULL;
25268 -       return freelist;
25269 +       goto out;
25270  }
25271  
25272  /*
25273 @@ -2601,6 +2661,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25274  {
25275         void *p;
25276         unsigned long flags;
25277 +       LIST_HEAD(tofree);
25278  
25279         local_irq_save(flags);
25280  #ifdef CONFIG_PREEMPT
25281 @@ -2612,8 +2673,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
25282         c = this_cpu_ptr(s->cpu_slab);
25283  #endif
25284  
25285 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
25286 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
25287         local_irq_restore(flags);
25288 +       free_delayed(&tofree);
25289         return p;
25290  }
25291  
25292 @@ -2799,7 +2861,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25293  
25294         do {
25295                 if (unlikely(n)) {
25296 -                       spin_unlock_irqrestore(&n->list_lock, flags);
25297 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25298                         n = NULL;
25299                 }
25300                 prior = page->freelist;
25301 @@ -2831,7 +2893,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25302                                  * Otherwise the list_lock will synchronize with
25303                                  * other processors updating the list of slabs.
25304                                  */
25305 -                               spin_lock_irqsave(&n->list_lock, flags);
25306 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
25307  
25308                         }
25309                 }
25310 @@ -2873,7 +2935,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25311                 add_partial(n, page, DEACTIVATE_TO_TAIL);
25312                 stat(s, FREE_ADD_PARTIAL);
25313         }
25314 -       spin_unlock_irqrestore(&n->list_lock, flags);
25315 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25316         return;
25317  
25318  slab_empty:
25319 @@ -2888,7 +2950,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
25320                 remove_full(s, n, page);
25321         }
25322  
25323 -       spin_unlock_irqrestore(&n->list_lock, flags);
25324 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25325         stat(s, FREE_SLAB);
25326         discard_slab(s, page);
25327  }
25328 @@ -3093,6 +3155,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25329                           void **p)
25330  {
25331         struct kmem_cache_cpu *c;
25332 +       LIST_HEAD(to_free);
25333         int i;
25334  
25335         /* memcg and kmem_cache debug support */
25336 @@ -3116,7 +3179,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25337                          * of re-populating per CPU c->freelist
25338                          */
25339                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
25340 -                                           _RET_IP_, c);
25341 +                                           _RET_IP_, c, &to_free);
25342                         if (unlikely(!p[i]))
25343                                 goto error;
25344  
25345 @@ -3128,6 +3191,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
25346         }
25347         c->tid = next_tid(c->tid);
25348         local_irq_enable();
25349 +       free_delayed(&to_free);
25350  
25351         /* Clear memory outside IRQ disabled fastpath loop */
25352         if (unlikely(flags & __GFP_ZERO)) {
25353 @@ -3275,7 +3339,7 @@ static void
25354  init_kmem_cache_node(struct kmem_cache_node *n)
25355  {
25356         n->nr_partial = 0;
25357 -       spin_lock_init(&n->list_lock);
25358 +       raw_spin_lock_init(&n->list_lock);
25359         INIT_LIST_HEAD(&n->partial);
25360  #ifdef CONFIG_SLUB_DEBUG
25361         atomic_long_set(&n->nr_slabs, 0);
25362 @@ -3619,6 +3683,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
25363                                                         const char *text)
25364  {
25365  #ifdef CONFIG_SLUB_DEBUG
25366 +#ifdef CONFIG_PREEMPT_RT_BASE
25367 +       /* XXX move out of irq-off section */
25368 +       slab_err(s, page, text, s->name);
25369 +#else
25370         void *addr = page_address(page);
25371         void *p;
25372         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
25373 @@ -3639,6 +3707,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
25374         slab_unlock(page);
25375         kfree(map);
25376  #endif
25377 +#endif
25378  }
25379  
25380  /*
25381 @@ -3652,7 +3721,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
25382         struct page *page, *h;
25383  
25384         BUG_ON(irqs_disabled());
25385 -       spin_lock_irq(&n->list_lock);
25386 +       raw_spin_lock_irq(&n->list_lock);
25387         list_for_each_entry_safe(page, h, &n->partial, lru) {
25388                 if (!page->inuse) {
25389                         remove_partial(n, page);
25390 @@ -3662,7 +3731,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
25391                         "Objects remaining in %s on __kmem_cache_shutdown()");
25392                 }
25393         }
25394 -       spin_unlock_irq(&n->list_lock);
25395 +       raw_spin_unlock_irq(&n->list_lock);
25396  
25397         list_for_each_entry_safe(page, h, &discard, lru)
25398                 discard_slab(s, page);
25399 @@ -3905,7 +3974,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
25400                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
25401                         INIT_LIST_HEAD(promote + i);
25402  
25403 -               spin_lock_irqsave(&n->list_lock, flags);
25404 +               raw_spin_lock_irqsave(&n->list_lock, flags);
25405  
25406                 /*
25407                  * Build lists of slabs to discard or promote.
25408 @@ -3936,7 +4005,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
25409                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
25410                         list_splice(promote + i, &n->partial);
25411  
25412 -               spin_unlock_irqrestore(&n->list_lock, flags);
25413 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
25414  
25415                 /* Release empty slabs */
25416                 list_for_each_entry_safe(page, t, &discard, lru)
25417 @@ -4112,6 +4181,12 @@ void __init kmem_cache_init(void)
25418  {
25419         static __initdata struct kmem_cache boot_kmem_cache,
25420                 boot_kmem_cache_node;
25421 +       int cpu;
25422 +
25423 +       for_each_possible_cpu(cpu) {
25424 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
25425 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
25426 +       }
25427  
25428         if (debug_guardpage_minorder())
25429                 slub_max_order = 0;
25430 @@ -4320,7 +4395,7 @@ static int validate_slab_node(struct kmem_cache *s,
25431         struct page *page;
25432         unsigned long flags;
25433  
25434 -       spin_lock_irqsave(&n->list_lock, flags);
25435 +       raw_spin_lock_irqsave(&n->list_lock, flags);
25436  
25437         list_for_each_entry(page, &n->partial, lru) {
25438                 validate_slab_slab(s, page, map);
25439 @@ -4342,7 +4417,7 @@ static int validate_slab_node(struct kmem_cache *s,
25440                        s->name, count, atomic_long_read(&n->nr_slabs));
25441  
25442  out:
25443 -       spin_unlock_irqrestore(&n->list_lock, flags);
25444 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
25445         return count;
25446  }
25447  
25448 @@ -4530,12 +4605,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
25449                 if (!atomic_long_read(&n->nr_slabs))
25450                         continue;
25451  
25452 -               spin_lock_irqsave(&n->list_lock, flags);
25453 +               raw_spin_lock_irqsave(&n->list_lock, flags);
25454                 list_for_each_entry(page, &n->partial, lru)
25455                         process_slab(&t, s, page, alloc, map);
25456                 list_for_each_entry(page, &n->full, lru)
25457                         process_slab(&t, s, page, alloc, map);
25458 -               spin_unlock_irqrestore(&n->list_lock, flags);
25459 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
25460         }
25461  
25462         for (i = 0; i < t.count; i++) {
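
[Note] The mm/slub.c changes above follow one pattern: the node list_lock becomes a raw_spinlock_t (see the mm/slab.h hunk), and slab pages that would otherwise be freed with interrupts disabled are parked on the per-CPU slub_free_list and released later through free_delayed() once interrupts are on again, since __free_slab() may sleep on PREEMPT_RT. A condensed sketch of that defer-and-drain pattern, using only symbols introduced above (kernel context assumed, function name hypothetical):

/* Sketch of the deferred-free pattern used by ___slab_alloc() and
 * flush_all(): steal the pending pages under the raw per-CPU lock while
 * IRQs are off, then do the real freeing after IRQs are re-enabled. */
static void example_drain_delayed_frees(void)
{
	struct slub_free_list *f;
	LIST_HEAD(tofree);
	unsigned long flags;

	local_irq_save(flags);
	f = this_cpu_ptr(&slub_free_list);
	raw_spin_lock(&f->lock);
	list_splice_init(&f->list, &tofree);	/* take the pending pages */
	raw_spin_unlock(&f->lock);
	local_irq_restore(flags);

	free_delayed(&tofree);			/* actual freeing, IRQs on */
}
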
25463 diff --git a/mm/swap.c b/mm/swap.c
25464 index 4dcf852e1e6d..69c3a5b24060 100644
25465 --- a/mm/swap.c
25466 +++ b/mm/swap.c
25467 @@ -32,6 +32,7 @@
25468  #include <linux/memcontrol.h>
25469  #include <linux/gfp.h>
25470  #include <linux/uio.h>
25471 +#include <linux/locallock.h>
25472  #include <linux/hugetlb.h>
25473  #include <linux/page_idle.h>
25474  
25475 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
25476  #ifdef CONFIG_SMP
25477  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
25478  #endif
25479 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
25480 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
25481  
25482  /*
25483   * This path almost never happens for VM activity - pages are normally
25484 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
25485                 unsigned long flags;
25486  
25487                 get_page(page);
25488 -               local_irq_save(flags);
25489 +               local_lock_irqsave(rotate_lock, flags);
25490                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
25491                 if (!pagevec_add(pvec, page) || PageCompound(page))
25492                         pagevec_move_tail(pvec);
25493 -               local_irq_restore(flags);
25494 +               local_unlock_irqrestore(rotate_lock, flags);
25495         }
25496  }
25497  
25498 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
25499  {
25500         page = compound_head(page);
25501         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
25502 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
25503 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
25504 +                                                      activate_page_pvecs);
25505  
25506                 get_page(page);
25507                 if (!pagevec_add(pvec, page) || PageCompound(page))
25508                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
25509 -               put_cpu_var(activate_page_pvecs);
25510 +               put_locked_var(swapvec_lock, activate_page_pvecs);
25511         }
25512  }
25513  
25514 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
25515  
25516  static void __lru_cache_activate_page(struct page *page)
25517  {
25518 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
25519 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
25520         int i;
25521  
25522         /*
25523 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
25524                 }
25525         }
25526  
25527 -       put_cpu_var(lru_add_pvec);
25528 +       put_locked_var(swapvec_lock, lru_add_pvec);
25529  }
25530  
25531  /*
25532 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
25533  
25534  static void __lru_cache_add(struct page *page)
25535  {
25536 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
25537 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
25538  
25539         get_page(page);
25540         if (!pagevec_add(pvec, page) || PageCompound(page))
25541                 __pagevec_lru_add(pvec);
25542 -       put_cpu_var(lru_add_pvec);
25543 +       put_locked_var(swapvec_lock, lru_add_pvec);
25544  }
25545  
25546  /**
25547 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
25548                 unsigned long flags;
25549  
25550                 /* No harm done if a racing interrupt already did this */
25551 -               local_irq_save(flags);
25552 +#ifdef CONFIG_PREEMPT_RT_BASE
25553 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
25554                 pagevec_move_tail(pvec);
25555 -               local_irq_restore(flags);
25556 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
25557 +#else
25558 +               local_lock_irqsave(rotate_lock, flags);
25559 +               pagevec_move_tail(pvec);
25560 +               local_unlock_irqrestore(rotate_lock, flags);
25561 +#endif
25562         }
25563  
25564         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
25565 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
25566                 return;
25567  
25568         if (likely(get_page_unless_zero(page))) {
25569 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
25570 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
25571 +                                                      lru_deactivate_file_pvecs);
25572  
25573                 if (!pagevec_add(pvec, page) || PageCompound(page))
25574                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
25575 -               put_cpu_var(lru_deactivate_file_pvecs);
25576 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
25577         }
25578  }
25579  
25580 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
25581  void deactivate_page(struct page *page)
25582  {
25583         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
25584 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
25585 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
25586 +                                                      lru_deactivate_pvecs);
25587  
25588                 get_page(page);
25589                 if (!pagevec_add(pvec, page) || PageCompound(page))
25590                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
25591 -               put_cpu_var(lru_deactivate_pvecs);
25592 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
25593         }
25594  }
25595  
25596  void lru_add_drain(void)
25597  {
25598 -       lru_add_drain_cpu(get_cpu());
25599 -       put_cpu();
25600 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
25601 +       local_unlock_cpu(swapvec_lock);
25602  }
25603  
25604 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
25605 +#ifdef CONFIG_PREEMPT_RT_BASE
25606 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
25607  {
25608 -       lru_add_drain();
25609 +       local_lock_on(swapvec_lock, cpu);
25610 +       lru_add_drain_cpu(cpu);
25611 +       local_unlock_on(swapvec_lock, cpu);
25612  }
25613  
25614 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
25615 +#else
25616  
25617  /*
25618   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
25619 @@ -686,6 +701,22 @@ static int __init lru_init(void)
25620  }
25621  early_initcall(lru_init);
25622  
25623 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
25624 +{
25625 +       lru_add_drain();
25626 +}
25627 +
25628 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
25629 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
25630 +{
25631 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
25632 +
25633 +       INIT_WORK(work, lru_add_drain_per_cpu);
25634 +       queue_work_on(cpu, lru_add_drain_wq, work);
25635 +       cpumask_set_cpu(cpu, has_work);
25636 +}
25637 +#endif
25638 +
25639  void lru_add_drain_all(void)
25640  {
25641         static DEFINE_MUTEX(lock);
25642 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
25643         cpumask_clear(&has_work);
25644  
25645         for_each_online_cpu(cpu) {
25646 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
25647 -
25648                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
25649                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
25650                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
25651                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
25652 -                   need_activate_page_drain(cpu)) {
25653 -                       INIT_WORK(work, lru_add_drain_per_cpu);
25654 -                       queue_work_on(cpu, lru_add_drain_wq, work);
25655 -                       cpumask_set_cpu(cpu, &has_work);
25656 -               }
25657 +                   need_activate_page_drain(cpu))
25658 +                       remote_lru_add_drain(cpu, &has_work);
25659         }
25660  
25661 +#ifndef CONFIG_PREEMPT_RT_BASE
25662         for_each_cpu(cpu, &has_work)
25663                 flush_work(&per_cpu(lru_add_drain_work, cpu));
25664 +#endif
25665  
25666         put_online_cpus();
25667         mutex_unlock(&lock);
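
[Note] In mm/swap.c the get_cpu_var()/put_cpu_var() pairs around the per-CPU pagevecs are replaced by locked per-CPU variables: rotate_lock and swapvec_lock are local IRQ locks, which compile down to plain IRQ-off/preempt-off sections on !RT but become per-CPU sleeping locks on PREEMPT_RT. A sketch of the conversion, mirroring the __lru_cache_add() hunk above (kernel context assumed, function name hypothetical):

/* Before: &get_cpu_var(lru_add_pvec) ... put_cpu_var(lru_add_pvec);
 * After: the same per-CPU access is serialized by swapvec_lock, so it
 * stays correct even where the section can be preempted on RT. */
static void example_lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);

	get_page(page);
	if (!pagevec_add(pvec, page) || PageCompound(page))
		__pagevec_lru_add(pvec);

	put_locked_var(swapvec_lock, lru_add_pvec);
}
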
25668 diff --git a/mm/truncate.c b/mm/truncate.c
25669 index 8d8c62d89e6d..5bf1bd25d077 100644
25670 --- a/mm/truncate.c
25671 +++ b/mm/truncate.c
25672 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
25673          * protected by mapping->tree_lock.
25674          */
25675         if (!workingset_node_shadows(node) &&
25676 -           !list_empty(&node->private_list))
25677 -               list_lru_del(&workingset_shadow_nodes,
25678 +           !list_empty(&node->private_list)) {
25679 +               local_lock(workingset_shadow_lock);
25680 +               list_lru_del(&__workingset_shadow_nodes,
25681                                 &node->private_list);
25682 +               local_unlock(workingset_shadow_lock);
25683 +       }
25684         __radix_tree_delete_node(&mapping->page_tree, node);
25685  unlock:
25686         spin_unlock_irq(&mapping->tree_lock);
25687 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
25688 index f2481cb4e6b2..db4de08fa97c 100644
25689 --- a/mm/vmalloc.c
25690 +++ b/mm/vmalloc.c
25691 @@ -845,7 +845,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
25692         struct vmap_block *vb;
25693         struct vmap_area *va;
25694         unsigned long vb_idx;
25695 -       int node, err;
25696 +       int node, err, cpu;
25697         void *vaddr;
25698  
25699         node = numa_node_id();
25700 @@ -888,11 +888,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
25701         BUG_ON(err);
25702         radix_tree_preload_end();
25703  
25704 -       vbq = &get_cpu_var(vmap_block_queue);
25705 +       cpu = get_cpu_light();
25706 +       vbq = this_cpu_ptr(&vmap_block_queue);
25707         spin_lock(&vbq->lock);
25708         list_add_tail_rcu(&vb->free_list, &vbq->free);
25709         spin_unlock(&vbq->lock);
25710 -       put_cpu_var(vmap_block_queue);
25711 +       put_cpu_light();
25712  
25713         return vaddr;
25714  }
25715 @@ -961,6 +962,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
25716         struct vmap_block *vb;
25717         void *vaddr = NULL;
25718         unsigned int order;
25719 +       int cpu;
25720  
25721         BUG_ON(offset_in_page(size));
25722         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
25723 @@ -975,7 +977,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
25724         order = get_order(size);
25725  
25726         rcu_read_lock();
25727 -       vbq = &get_cpu_var(vmap_block_queue);
25728 +       cpu = get_cpu_light();
25729 +       vbq = this_cpu_ptr(&vmap_block_queue);
25730         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
25731                 unsigned long pages_off;
25732  
25733 @@ -998,7 +1001,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
25734                 break;
25735         }
25736  
25737 -       put_cpu_var(vmap_block_queue);
25738 +       put_cpu_light();
25739         rcu_read_unlock();
25740  
25741         /* Allocate new block if nothing was found */
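
[Note] new_vmap_block() and vb_alloc() above switch from get_cpu_var() to get_cpu_light(): on mainline that behaves like get_cpu(), while on PREEMPT_RT it only pins the task to the CPU, because the per-CPU vmap_block_queue is already serialized by its own spinlock rather than by disabled preemption. A sketch of the resulting access pattern (kernel context assumed, function name hypothetical):

/* Sketch: pin to a CPU (lightly on RT), then rely on vbq->lock rather
 * than on disabled preemption to serialize the free list. */
static void example_queue_vmap_block(struct vmap_block *vb)
{
	struct vmap_block_queue *vbq;
	int cpu;

	cpu = get_cpu_light();
	vbq = this_cpu_ptr(&vmap_block_queue);

	spin_lock(&vbq->lock);
	list_add_tail_rcu(&vb->free_list, &vbq->free);
	spin_unlock(&vbq->lock);

	put_cpu_light();
}
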
25742 diff --git a/mm/vmstat.c b/mm/vmstat.c
25743 index 604f26a4f696..312006d2db50 100644
25744 --- a/mm/vmstat.c
25745 +++ b/mm/vmstat.c
25746 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
25747         long x;
25748         long t;
25749  
25750 +       preempt_disable_rt();
25751         x = delta + __this_cpu_read(*p);
25752  
25753         t = __this_cpu_read(pcp->stat_threshold);
25754 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
25755                 x = 0;
25756         }
25757         __this_cpu_write(*p, x);
25758 +       preempt_enable_rt();
25759  }
25760  EXPORT_SYMBOL(__mod_zone_page_state);
25761  
25762 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
25763         long x;
25764         long t;
25765  
25766 +       preempt_disable_rt();
25767         x = delta + __this_cpu_read(*p);
25768  
25769         t = __this_cpu_read(pcp->stat_threshold);
25770 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
25771                 x = 0;
25772         }
25773         __this_cpu_write(*p, x);
25774 +       preempt_enable_rt();
25775  }
25776  EXPORT_SYMBOL(__mod_node_page_state);
25777  
25778 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
25779         s8 __percpu *p = pcp->vm_stat_diff + item;
25780         s8 v, t;
25781  
25782 +       preempt_disable_rt();
25783         v = __this_cpu_inc_return(*p);
25784         t = __this_cpu_read(pcp->stat_threshold);
25785         if (unlikely(v > t)) {
25786 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
25787                 zone_page_state_add(v + overstep, zone, item);
25788                 __this_cpu_write(*p, -overstep);
25789         }
25790 +       preempt_enable_rt();
25791  }
25792  
25793  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25794 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25795         s8 __percpu *p = pcp->vm_node_stat_diff + item;
25796         s8 v, t;
25797  
25798 +       preempt_disable_rt();
25799         v = __this_cpu_inc_return(*p);
25800         t = __this_cpu_read(pcp->stat_threshold);
25801         if (unlikely(v > t)) {
25802 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25803                 node_page_state_add(v + overstep, pgdat, item);
25804                 __this_cpu_write(*p, -overstep);
25805         }
25806 +       preempt_enable_rt();
25807  }
25808  
25809  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
25810 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
25811         s8 __percpu *p = pcp->vm_stat_diff + item;
25812         s8 v, t;
25813  
25814 +       preempt_disable_rt();
25815         v = __this_cpu_dec_return(*p);
25816         t = __this_cpu_read(pcp->stat_threshold);
25817         if (unlikely(v < - t)) {
25818 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
25819                 zone_page_state_add(v - overstep, zone, item);
25820                 __this_cpu_write(*p, overstep);
25821         }
25822 +       preempt_enable_rt();
25823  }
25824  
25825  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25826 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25827         s8 __percpu *p = pcp->vm_node_stat_diff + item;
25828         s8 v, t;
25829  
25830 +       preempt_disable_rt();
25831         v = __this_cpu_dec_return(*p);
25832         t = __this_cpu_read(pcp->stat_threshold);
25833         if (unlikely(v < - t)) {
25834 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
25835                 node_page_state_add(v - overstep, pgdat, item);
25836                 __this_cpu_write(*p, overstep);
25837         }
25838 +       preempt_enable_rt();
25839  }
25840  
25841  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
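
[Note] The vmstat counters above are updated with __this_cpu operations that are only safe while preemption is off; preempt_disable_rt()/preempt_enable_rt() are no-ops on !RT and real preempt_disable()/preempt_enable() on PREEMPT_RT, where the callers' locks no longer imply a preemption-off region. A sketch of the shape such an update takes (kernel context assumed, function name hypothetical):

/* Sketch: an RT-safe per-CPU counter bump in the style of the hunks
 * above. On !RT the preempt_*_rt() calls compile away; on RT they
 * recreate the preemption-off window the __this_cpu ops require. */
static void example_mod_counter(s8 __percpu *p, s8 delta)
{
	preempt_disable_rt();
	__this_cpu_add(*p, delta);
	preempt_enable_rt();
}
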
25842 diff --git a/mm/workingset.c b/mm/workingset.c
25843 index 4c4f05655e6e..b97b1e87b54c 100644
25844 --- a/mm/workingset.c
25845 +++ b/mm/workingset.c
25846 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
25847   * point where they would still be useful.
25848   */
25849  
25850 -struct list_lru workingset_shadow_nodes;
25851 +struct list_lru __workingset_shadow_nodes;
25852 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
25853  
25854  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
25855                                         struct shrink_control *sc)
25856 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
25857         unsigned long pages;
25858  
25859         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
25860 -       local_irq_disable();
25861 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
25862 -       local_irq_enable();
25863 +       local_lock_irq(workingset_shadow_lock);
25864 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
25865 +       local_unlock_irq(workingset_shadow_lock);
25866  
25867         if (sc->memcg) {
25868                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
25869 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
25870         spin_unlock(&mapping->tree_lock);
25871         ret = LRU_REMOVED_RETRY;
25872  out:
25873 -       local_irq_enable();
25874 +       local_unlock_irq(workingset_shadow_lock);
25875         cond_resched();
25876 -       local_irq_disable();
25877 +       local_lock_irq(workingset_shadow_lock);
25878         spin_lock(lru_lock);
25879         return ret;
25880  }
25881 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
25882         unsigned long ret;
25883  
25884         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
25885 -       local_irq_disable();
25886 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
25887 +       local_lock_irq(workingset_shadow_lock);
25888 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
25889                                     shadow_lru_isolate, NULL);
25890 -       local_irq_enable();
25891 +       local_unlock_irq(workingset_shadow_lock);
25892         return ret;
25893  }
25894  
25895 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
25896         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
25897                timestamp_bits, max_order, bucket_order);
25898  
25899 -       ret = __list_lru_init(&workingset_shadow_nodes, true, &shadow_nodes_key);
25900 +       ret = __list_lru_init(&__workingset_shadow_nodes, true, &shadow_nodes_key);
25901         if (ret)
25902                 goto err;
25903         ret = register_shrinker(&workingset_shadow_shrinker);
25904 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
25905                 goto err_list_lru;
25906         return 0;
25907  err_list_lru:
25908 -       list_lru_destroy(&workingset_shadow_nodes);
25909 +       list_lru_destroy(&__workingset_shadow_nodes);
25910  err:
25911         return ret;
25912  }
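
[Note] The mm/truncate.c and mm/workingset.c hunks above rename workingset_shadow_nodes to __workingset_shadow_nodes and bracket every list_lru operation on it with workingset_shadow_lock, a local IRQ lock, instead of bare local_irq_disable()/local_irq_enable(). A sketch of the access pattern (kernel context assumed, function name hypothetical):

/* Sketch: shadow-node accounting under the local lock rather than a
 * raw IRQ-off section, matching the truncate.c hunk above. */
static void example_del_shadow_node(struct radix_tree_node *node)
{
	local_lock(workingset_shadow_lock);
	list_lru_del(&__workingset_shadow_nodes, &node->private_list);
	local_unlock(workingset_shadow_lock);
}
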
25913 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
25914 index 1689bb58e0d1..e52a8cb6aa5a 100644
25915 --- a/mm/zsmalloc.c
25916 +++ b/mm/zsmalloc.c
25917 @@ -53,6 +53,7 @@
25918  #include <linux/mount.h>
25919  #include <linux/migrate.h>
25920  #include <linux/pagemap.h>
25921 +#include <linux/locallock.h>
25922  
25923  #define ZSPAGE_MAGIC   0x58
25924  
25925 @@ -70,9 +71,22 @@
25926   */
25927  #define ZS_MAX_ZSPAGE_ORDER 2
25928  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
25929 -
25930  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
25931  
25932 +#ifdef CONFIG_PREEMPT_RT_FULL
25933 +
25934 +struct zsmalloc_handle {
25935 +       unsigned long addr;
25936 +       struct mutex lock;
25937 +};
25938 +
25939 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
25940 +
25941 +#else
25942 +
25943 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
25944 +#endif
25945 +
25946  /*
25947   * Object location (<PFN>, <obj_idx>) is encoded as
25948   * as single (unsigned long) handle value.
25949 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
25950  
25951  static int create_cache(struct zs_pool *pool)
25952  {
25953 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
25954 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
25955                                         0, 0, NULL);
25956         if (!pool->handle_cachep)
25957                 return 1;
25958 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
25959  
25960  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
25961  {
25962 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
25963 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
25964 +       void *p;
25965 +
25966 +       p = kmem_cache_alloc(pool->handle_cachep,
25967 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
25968 +#ifdef CONFIG_PREEMPT_RT_FULL
25969 +       if (p) {
25970 +               struct zsmalloc_handle *zh = p;
25971 +
25972 +               mutex_init(&zh->lock);
25973 +       }
25974 +#endif
25975 +       return (unsigned long)p;
25976  }
25977  
25978 +#ifdef CONFIG_PREEMPT_RT_FULL
25979 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
25980 +{
25981 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
25982 +}
25983 +#endif
25984 +
25985  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
25986  {
25987         kmem_cache_free(pool->handle_cachep, (void *)handle);
25988 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
25989  
25990  static void record_obj(unsigned long handle, unsigned long obj)
25991  {
25992 +#ifdef CONFIG_PREEMPT_RT_FULL
25993 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
25994 +
25995 +       WRITE_ONCE(zh->addr, obj);
25996 +#else
25997         /*
25998          * lsb of @obj represents handle lock while other bits
25999          * represent object value the handle is pointing so
26000          * updating shouldn't do store tearing.
26001          */
26002         WRITE_ONCE(*(unsigned long *)handle, obj);
26003 +#endif
26004  }
26005  
26006  /* zpool driver */
26007 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
26008  
26009  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
26010  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
26011 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
26012  
26013  static bool is_zspage_isolated(struct zspage *zspage)
26014  {
26015 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
26016  
26017  static unsigned long handle_to_obj(unsigned long handle)
26018  {
26019 +#ifdef CONFIG_PREEMPT_RT_FULL
26020 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26021 +
26022 +       return zh->addr;
26023 +#else
26024         return *(unsigned long *)handle;
26025 +#endif
26026  }
26027  
26028  static unsigned long obj_to_head(struct page *page, void *obj)
26029 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
26030  
26031  static inline int testpin_tag(unsigned long handle)
26032  {
26033 +#ifdef CONFIG_PREEMPT_RT_FULL
26034 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26035 +
26036 +       return mutex_is_locked(&zh->lock);
26037 +#else
26038         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
26039 +#endif
26040  }
26041  
26042  static inline int trypin_tag(unsigned long handle)
26043  {
26044 +#ifdef CONFIG_PREEMPT_RT_FULL
26045 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26046 +
26047 +       return mutex_trylock(&zh->lock);
26048 +#else
26049         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
26050 +#endif
26051  }
26052  
26053  static void pin_tag(unsigned long handle)
26054  {
26055 +#ifdef CONFIG_PREEMPT_RT_FULL
26056 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26057 +
26058 +       return mutex_lock(&zh->lock);
26059 +#else
26060         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
26061 +#endif
26062  }
26063  
26064  static void unpin_tag(unsigned long handle)
26065  {
26066 +#ifdef CONFIG_PREEMPT_RT_FULL
26067 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
26068 +
26069 +       return mutex_unlock(&zh->lock);
26070 +#else
26071         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
26072 +#endif
26073  }
26074  
26075  static void reset_page(struct page *page)
26076 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
26077         class = pool->size_class[class_idx];
26078         off = (class->size * obj_idx) & ~PAGE_MASK;
26079  
26080 -       area = &get_cpu_var(zs_map_area);
26081 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
26082         area->vm_mm = mm;
26083         if (off + class->size <= PAGE_SIZE) {
26084                 /* this object is contained entirely within a page */
26085 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
26086  
26087                 __zs_unmap_object(area, pages, off, class->size);
26088         }
26089 -       put_cpu_var(zs_map_area);
26090 +       put_locked_var(zs_map_area_lock, zs_map_area);
26091  
26092         migrate_read_unlock(zspage);
26093         unpin_tag(handle);
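
[Note] On PREEMPT_RT the zsmalloc handle can no longer be pinned with a bit spinlock, so the hunks above enlarge the handle allocation to a struct zsmalloc_handle carrying a mutex, and testpin_tag()/trypin_tag()/pin_tag()/unpin_tag() become mutex operations on it. A sketch of the RT pin/unpin pair (kernel context and a CONFIG_PREEMPT_RT_FULL build assumed, function name hypothetical):

/* Sketch: on RT a handle really points at a struct zsmalloc_handle, so
 * pinning a tag is simply taking the mutex embedded in that object. */
static void example_pin_unpin(unsigned long handle)
{
	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);

	mutex_lock(&zh->lock);		/* pin_tag() on RT */
	/* ... operate on zh->addr, the encoded object location ... */
	mutex_unlock(&zh->lock);	/* unpin_tag() on RT */
}
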
26094 diff --git a/net/core/dev.c b/net/core/dev.c
26095 index 2e04fd188081..3ba60ef8c79e 100644
26096 --- a/net/core/dev.c
26097 +++ b/net/core/dev.c
26098 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
26099  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
26100  
26101  static seqcount_t devnet_rename_seq;
26102 +static DEFINE_MUTEX(devnet_rename_mutex);
26103  
26104  static inline void dev_base_seq_inc(struct net *net)
26105  {
26106 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
26107  static inline void rps_lock(struct softnet_data *sd)
26108  {
26109  #ifdef CONFIG_RPS
26110 -       spin_lock(&sd->input_pkt_queue.lock);
26111 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
26112  #endif
26113  }
26114  
26115  static inline void rps_unlock(struct softnet_data *sd)
26116  {
26117  #ifdef CONFIG_RPS
26118 -       spin_unlock(&sd->input_pkt_queue.lock);
26119 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
26120  #endif
26121  }
26122  
26123 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
26124         strcpy(name, dev->name);
26125         rcu_read_unlock();
26126         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
26127 -               cond_resched();
26128 +               mutex_lock(&devnet_rename_mutex);
26129 +               mutex_unlock(&devnet_rename_mutex);
26130                 goto retry;
26131         }
26132  
26133 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
26134         if (dev->flags & IFF_UP)
26135                 return -EBUSY;
26136  
26137 -       write_seqcount_begin(&devnet_rename_seq);
26138 +       mutex_lock(&devnet_rename_mutex);
26139 +       __raw_write_seqcount_begin(&devnet_rename_seq);
26140  
26141 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
26142 -               write_seqcount_end(&devnet_rename_seq);
26143 -               return 0;
26144 -       }
26145 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
26146 +               goto outunlock;
26147  
26148         memcpy(oldname, dev->name, IFNAMSIZ);
26149  
26150         err = dev_get_valid_name(net, dev, newname);
26151 -       if (err < 0) {
26152 -               write_seqcount_end(&devnet_rename_seq);
26153 -               return err;
26154 -       }
26155 +       if (err < 0)
26156 +               goto outunlock;
26157  
26158         if (oldname[0] && !strchr(oldname, '%'))
26159                 netdev_info(dev, "renamed from %s\n", oldname);
26160 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
26161         if (ret) {
26162                 memcpy(dev->name, oldname, IFNAMSIZ);
26163                 dev->name_assign_type = old_assign_type;
26164 -               write_seqcount_end(&devnet_rename_seq);
26165 -               return ret;
26166 +               err = ret;
26167 +               goto outunlock;
26168         }
26169  
26170 -       write_seqcount_end(&devnet_rename_seq);
26171 +       __raw_write_seqcount_end(&devnet_rename_seq);
26172 +       mutex_unlock(&devnet_rename_mutex);
26173  
26174         netdev_adjacent_rename_links(dev, oldname);
26175  
26176 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
26177                 /* err >= 0 after dev_alloc_name() or stores the first errno */
26178                 if (err >= 0) {
26179                         err = ret;
26180 -                       write_seqcount_begin(&devnet_rename_seq);
26181 +                       mutex_lock(&devnet_rename_mutex);
26182 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
26183                         memcpy(dev->name, oldname, IFNAMSIZ);
26184                         memcpy(oldname, newname, IFNAMSIZ);
26185                         dev->name_assign_type = old_assign_type;
26186 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
26187         }
26188  
26189         return err;
26190 +
26191 +outunlock:
26192 +       __raw_write_seqcount_end(&devnet_rename_seq);
26193 +       mutex_unlock(&devnet_rename_mutex);
26194 +       return err;
26195  }
26196  
26197  /**
26198 @@ -2285,6 +2291,7 @@ static void __netif_reschedule(struct Qdisc *q)
26199         sd->output_queue_tailp = &q->next_sched;
26200         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26201         local_irq_restore(flags);
26202 +       preempt_check_resched_rt();
26203  }
26204  
26205  void __netif_schedule(struct Qdisc *q)
26206 @@ -2366,6 +2373,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
26207         __this_cpu_write(softnet_data.completion_queue, skb);
26208         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26209         local_irq_restore(flags);
26210 +       preempt_check_resched_rt();
26211  }
26212  EXPORT_SYMBOL(__dev_kfree_skb_irq);
26213  
26214 @@ -3100,7 +3108,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
26215          * This permits qdisc->running owner to get the lock more
26216          * often and dequeue packets faster.
26217          */
26218 +#ifdef CONFIG_PREEMPT_RT_FULL
26219 +       contended = true;
26220 +#else
26221         contended = qdisc_is_running(q);
26222 +#endif
26223         if (unlikely(contended))
26224                 spin_lock(&q->busylock);
26225  
26226 @@ -3163,8 +3175,10 @@ static void skb_update_prio(struct sk_buff *skb)
26227  #define skb_update_prio(skb)
26228  #endif
26229  
26230 +#ifndef CONFIG_PREEMPT_RT_FULL
26231  DEFINE_PER_CPU(int, xmit_recursion);
26232  EXPORT_SYMBOL(xmit_recursion);
26233 +#endif
26234  
26235  /**
26236   *     dev_loopback_xmit - loop back @skb
26237 @@ -3398,8 +3412,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
26238                 int cpu = smp_processor_id(); /* ok because BHs are off */
26239  
26240                 if (txq->xmit_lock_owner != cpu) {
26241 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
26242 -                                    XMIT_RECURSION_LIMIT))
26243 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
26244                                 goto recursion_alert;
26245  
26246                         skb = validate_xmit_skb(skb, dev);
26247 @@ -3409,9 +3422,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
26248                         HARD_TX_LOCK(dev, txq, cpu);
26249  
26250                         if (!netif_xmit_stopped(txq)) {
26251 -                               __this_cpu_inc(xmit_recursion);
26252 +                               xmit_rec_inc();
26253                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
26254 -                               __this_cpu_dec(xmit_recursion);
26255 +                               xmit_rec_dec();
26256                                 if (dev_xmit_complete(rc)) {
26257                                         HARD_TX_UNLOCK(dev, txq);
26258                                         goto out;
26259 @@ -3785,6 +3798,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
26260         rps_unlock(sd);
26261  
26262         local_irq_restore(flags);
26263 +       preempt_check_resched_rt();
26264  
26265         atomic_long_inc(&skb->dev->rx_dropped);
26266         kfree_skb(skb);
26267 @@ -3803,7 +3817,7 @@ static int netif_rx_internal(struct sk_buff *skb)
26268                 struct rps_dev_flow voidflow, *rflow = &voidflow;
26269                 int cpu;
26270  
26271 -               preempt_disable();
26272 +               migrate_disable();
26273                 rcu_read_lock();
26274  
26275                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
26276 @@ -3813,13 +3827,13 @@ static int netif_rx_internal(struct sk_buff *skb)
26277                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
26278  
26279                 rcu_read_unlock();
26280 -               preempt_enable();
26281 +               migrate_enable();
26282         } else
26283  #endif
26284         {
26285                 unsigned int qtail;
26286 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
26287 -               put_cpu();
26288 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
26289 +               put_cpu_light();
26290         }
26291         return ret;
26292  }
26293 @@ -3853,11 +3867,9 @@ int netif_rx_ni(struct sk_buff *skb)
26294  
26295         trace_netif_rx_ni_entry(skb);
26296  
26297 -       preempt_disable();
26298 +       local_bh_disable();
26299         err = netif_rx_internal(skb);
26300 -       if (local_softirq_pending())
26301 -               do_softirq();
26302 -       preempt_enable();
26303 +       local_bh_enable();
26304  
26305         return err;
26306  }
26307 @@ -4336,7 +4348,7 @@ static void flush_backlog(struct work_struct *work)
26308         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
26309                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
26310                         __skb_unlink(skb, &sd->input_pkt_queue);
26311 -                       kfree_skb(skb);
26312 +                       __skb_queue_tail(&sd->tofree_queue, skb);
26313                         input_queue_head_incr(sd);
26314                 }
26315         }
26316 @@ -4346,11 +4358,14 @@ static void flush_backlog(struct work_struct *work)
26317         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
26318                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
26319                         __skb_unlink(skb, &sd->process_queue);
26320 -                       kfree_skb(skb);
26321 +                       __skb_queue_tail(&sd->tofree_queue, skb);
26322                         input_queue_head_incr(sd);
26323                 }
26324         }
26325 +       if (!skb_queue_empty(&sd->tofree_queue))
26326 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
26327         local_bh_enable();
26328 +
26329  }
26330  
26331  static void flush_all_backlogs(void)
26332 @@ -4831,6 +4846,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
26333                 sd->rps_ipi_list = NULL;
26334  
26335                 local_irq_enable();
26336 +               preempt_check_resched_rt();
26337  
26338                 /* Send pending IPI's to kick RPS processing on remote cpus. */
26339                 while (remsd) {
26340 @@ -4844,6 +4860,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
26341         } else
26342  #endif
26343                 local_irq_enable();
26344 +       preempt_check_resched_rt();
26345  }
26346  
26347  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
26348 @@ -4873,7 +4890,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
26349         while (again) {
26350                 struct sk_buff *skb;
26351  
26352 +               local_irq_disable();
26353                 while ((skb = __skb_dequeue(&sd->process_queue))) {
26354 +                       local_irq_enable();
26355                         rcu_read_lock();
26356                         __netif_receive_skb(skb);
26357                         rcu_read_unlock();
26358 @@ -4881,9 +4900,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
26359                         if (++work >= quota)
26360                                 return work;
26361  
26362 +                       local_irq_disable();
26363                 }
26364  
26365 -               local_irq_disable();
26366                 rps_lock(sd);
26367                 if (skb_queue_empty(&sd->input_pkt_queue)) {
26368                         /*
26369 @@ -4921,9 +4940,11 @@ void __napi_schedule(struct napi_struct *n)
26370         local_irq_save(flags);
26371         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
26372         local_irq_restore(flags);
26373 +       preempt_check_resched_rt();
26374  }
26375  EXPORT_SYMBOL(__napi_schedule);
26376  
26377 +#ifndef CONFIG_PREEMPT_RT_FULL
26378  /**
26379   * __napi_schedule_irqoff - schedule for receive
26380   * @n: entry to schedule
26381 @@ -4935,6 +4956,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
26382         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
26383  }
26384  EXPORT_SYMBOL(__napi_schedule_irqoff);
26385 +#endif
26386  
26387  void __napi_complete(struct napi_struct *n)
26388  {
26389 @@ -5224,13 +5246,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
26390         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
26391         unsigned long time_limit = jiffies + 2;
26392         int budget = netdev_budget;
26393 +       struct sk_buff_head tofree_q;
26394 +       struct sk_buff *skb;
26395         LIST_HEAD(list);
26396         LIST_HEAD(repoll);
26397  
26398 +       __skb_queue_head_init(&tofree_q);
26399 +
26400         local_irq_disable();
26401 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
26402         list_splice_init(&sd->poll_list, &list);
26403         local_irq_enable();
26404  
26405 +       while ((skb = __skb_dequeue(&tofree_q)))
26406 +               kfree_skb(skb);
26407 +
26408         for (;;) {
26409                 struct napi_struct *n;
26410  
26411 @@ -5261,7 +5291,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
26412         list_splice_tail(&repoll, &list);
26413         list_splice(&list, &sd->poll_list);
26414         if (!list_empty(&sd->poll_list))
26415 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
26416 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
26417  
26418         net_rps_action_and_irq_enable(sd);
26419  }
26420 @@ -8022,16 +8052,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
26421  
26422         raise_softirq_irqoff(NET_TX_SOFTIRQ);
26423         local_irq_enable();
26424 +       preempt_check_resched_rt();
26425  
26426         /* Process offline CPU's input_pkt_queue */
26427         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
26428                 netif_rx_ni(skb);
26429                 input_queue_head_incr(oldsd);
26430         }
26431 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
26432 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
26433                 netif_rx_ni(skb);
26434                 input_queue_head_incr(oldsd);
26435         }
26436 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
26437 +               kfree_skb(skb);
26438 +       }
26439  
26440         return NOTIFY_OK;
26441  }
26442 @@ -8336,8 +8370,9 @@ static int __init net_dev_init(void)
26443  
26444                 INIT_WORK(flush, flush_backlog);
26445  
26446 -               skb_queue_head_init(&sd->input_pkt_queue);
26447 -               skb_queue_head_init(&sd->process_queue);
26448 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
26449 +               skb_queue_head_init_raw(&sd->process_queue);
26450 +               skb_queue_head_init_raw(&sd->tofree_queue);
26451                 INIT_LIST_HEAD(&sd->poll_list);
26452                 sd->output_queue_tailp = &sd->output_queue;
26453  #ifdef CONFIG_RPS
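
The net/core/dev.c changes above share one theme: with preemptible softirqs, skbs must not be freed from sections that run with interrupts off or under the raw backlog-queue locks (kfree_skb() can reach sleeping locks on RT), so such skbs are parked on the new per-CPU tofree_queue and released from net_rx_action() once interrupts are enabled again. Likewise, every local_irq_enable() that may have queued a wakeup is followed by preempt_check_resched_rt(). A minimal sketch of that helper, assuming the definition used elsewhere in the RT series (not shown in these hunks):

/* sketch only: a reschedule check on RT, a no-op otherwise */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_check_resched_rt()     preempt_check_resched()
#else
# define preempt_check_resched_rt()     barrier()
#endif
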
26454 diff --git a/net/core/filter.c b/net/core/filter.c
26455 index b391209838ef..b86e9681a88e 100644
26456 --- a/net/core/filter.c
26457 +++ b/net/core/filter.c
26458 @@ -1645,7 +1645,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
26459  {
26460         int ret;
26461  
26462 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
26463 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
26464                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
26465                 kfree_skb(skb);
26466                 return -ENETDOWN;
26467 @@ -1653,9 +1653,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
26468  
26469         skb->dev = dev;
26470  
26471 -       __this_cpu_inc(xmit_recursion);
26472 +       xmit_rec_inc();
26473         ret = dev_queue_xmit(skb);
26474 -       __this_cpu_dec(xmit_recursion);
26475 +       xmit_rec_dec();
26476  
26477         return ret;
26478  }
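
xmit_rec_read()/xmit_rec_inc()/xmit_rec_dec() replace the raw __this_cpu ops because, once the transmit path can be preempted and migrated on RT, a per-CPU recursion counter is no longer stable across the call. The RT series is expected to keep the counter in the task instead; a hedged sketch, assuming a current->xmit_recursion field added elsewhere in this patch (the per-CPU xmit_recursion variable is the existing mainline one):

/* illustrative only */
#ifdef CONFIG_PREEMPT_RT_FULL
static inline int  xmit_rec_read(void) { return current->xmit_recursion; }
static inline void xmit_rec_inc(void)  { current->xmit_recursion++; }
static inline void xmit_rec_dec(void)  { current->xmit_recursion--; }
#else
static inline int  xmit_rec_read(void) { return __this_cpu_read(xmit_recursion); }
static inline void xmit_rec_inc(void)  { __this_cpu_inc(xmit_recursion); }
static inline void xmit_rec_dec(void)  { __this_cpu_dec(xmit_recursion); }
#endif
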
26479 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
26480 index cad8e791f28e..2a9364fe62a5 100644
26481 --- a/net/core/gen_estimator.c
26482 +++ b/net/core/gen_estimator.c
26483 @@ -84,7 +84,7 @@ struct gen_estimator
26484         struct gnet_stats_basic_packed  *bstats;
26485         struct gnet_stats_rate_est64    *rate_est;
26486         spinlock_t              *stats_lock;
26487 -       seqcount_t              *running;
26488 +       net_seqlock_t           *running;
26489         int                     ewma_log;
26490         u32                     last_packets;
26491         unsigned long           avpps;
26492 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
26493                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
26494                       struct gnet_stats_rate_est64 *rate_est,
26495                       spinlock_t *stats_lock,
26496 -                     seqcount_t *running,
26497 +                     net_seqlock_t *running,
26498                       struct nlattr *opt)
26499  {
26500         struct gen_estimator *est;
26501 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
26502                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
26503                           struct gnet_stats_rate_est64 *rate_est,
26504                           spinlock_t *stats_lock,
26505 -                         seqcount_t *running, struct nlattr *opt)
26506 +                         net_seqlock_t *running, struct nlattr *opt)
26507  {
26508         gen_kill_estimator(bstats, rate_est);
26509         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
26510 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
26511 index 508e051304fb..bc3b17b78c94 100644
26512 --- a/net/core/gen_stats.c
26513 +++ b/net/core/gen_stats.c
26514 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
26515  }
26516  
26517  void
26518 -__gnet_stats_copy_basic(const seqcount_t *running,
26519 +__gnet_stats_copy_basic(net_seqlock_t *running,
26520                         struct gnet_stats_basic_packed *bstats,
26521                         struct gnet_stats_basic_cpu __percpu *cpu,
26522                         struct gnet_stats_basic_packed *b)
26523 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
26524         }
26525         do {
26526                 if (running)
26527 -                       seq = read_seqcount_begin(running);
26528 +                       seq = net_seq_begin(running);
26529                 bstats->bytes = b->bytes;
26530                 bstats->packets = b->packets;
26531 -       } while (running && read_seqcount_retry(running, seq));
26532 +       } while (running && net_seq_retry(running, seq));
26533  }
26534  EXPORT_SYMBOL(__gnet_stats_copy_basic);
26535  
26536 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
26537   * if the room in the socket buffer was not sufficient.
26538   */
26539  int
26540 -gnet_stats_copy_basic(const seqcount_t *running,
26541 +gnet_stats_copy_basic(net_seqlock_t *running,
26542                       struct gnet_dump *d,
26543                       struct gnet_stats_basic_cpu __percpu *cpu,
26544                       struct gnet_stats_basic_packed *b)
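
net_seqlock_t, net_seq_begin() and net_seq_retry() wrap the qdisc "running" protection so that it stays a plain seqcount_t on !RT but becomes a full seqlock_t on RT, where the writer side must be a sleeping lock and readers must not spin against a preempted writer (the sch_generic.c hunk further down initializes it with __SEQLOCK_UNLOCKED() under CONFIG_PREEMPT_RT_BASE). A sketch of the mapping, assuming the small wrapper header the RT series adds for this (not part of the hunks shown here):

/* sketch of the net_seqlock wrappers */
#ifdef CONFIG_PREEMPT_RT_BASE
# define net_seqlock_t                  seqlock_t
# define net_seq_begin(__r)             read_seqbegin(__r)
# define net_seq_retry(__r, __s)        read_seqretry(__r, __s)
#else
# define net_seqlock_t                  seqcount_t
# define net_seq_begin(__r)             read_seqcount_begin(__r)
# define net_seq_retry(__r, __s)        read_seqcount_retry(__r, __s)
#endif
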
26545 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
26546 index fe008f1bd930..9fa6bea3dd3f 100644
26547 --- a/net/core/skbuff.c
26548 +++ b/net/core/skbuff.c
26549 @@ -64,6 +64,7 @@
26550  #include <linux/errqueue.h>
26551  #include <linux/prefetch.h>
26552  #include <linux/if_vlan.h>
26553 +#include <linux/locallock.h>
26554  
26555  #include <net/protocol.h>
26556  #include <net/dst.h>
26557 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
26558  
26559  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
26560  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
26561 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
26562 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
26563  
26564  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
26565  {
26566 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
26567         unsigned long flags;
26568         void *data;
26569  
26570 -       local_irq_save(flags);
26571 +       local_lock_irqsave(netdev_alloc_lock, flags);
26572         nc = this_cpu_ptr(&netdev_alloc_cache);
26573         data = __alloc_page_frag(nc, fragsz, gfp_mask);
26574 -       local_irq_restore(flags);
26575 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
26576         return data;
26577  }
26578  
26579 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
26580  
26581  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
26582  {
26583 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
26584 +       struct napi_alloc_cache *nc;
26585 +       void *data;
26586  
26587 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
26588 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26589 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
26590 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26591 +       return data;
26592  }
26593  
26594  void *napi_alloc_frag(unsigned int fragsz)
26595 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
26596         if (sk_memalloc_socks())
26597                 gfp_mask |= __GFP_MEMALLOC;
26598  
26599 -       local_irq_save(flags);
26600 +       local_lock_irqsave(netdev_alloc_lock, flags);
26601  
26602         nc = this_cpu_ptr(&netdev_alloc_cache);
26603         data = __alloc_page_frag(nc, len, gfp_mask);
26604         pfmemalloc = nc->pfmemalloc;
26605  
26606 -       local_irq_restore(flags);
26607 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
26608  
26609         if (unlikely(!data))
26610                 return NULL;
26611 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
26612  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
26613                                  gfp_t gfp_mask)
26614  {
26615 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
26616 +       struct napi_alloc_cache *nc;
26617         struct sk_buff *skb;
26618         void *data;
26619 +       bool pfmemalloc;
26620  
26621         len += NET_SKB_PAD + NET_IP_ALIGN;
26622  
26623 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
26624         if (sk_memalloc_socks())
26625                 gfp_mask |= __GFP_MEMALLOC;
26626  
26627 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26628         data = __alloc_page_frag(&nc->page, len, gfp_mask);
26629 +       pfmemalloc = nc->page.pfmemalloc;
26630 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26631         if (unlikely(!data))
26632                 return NULL;
26633  
26634 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
26635         }
26636  
26637         /* use OR instead of assignment to avoid clearing of bits in mask */
26638 -       if (nc->page.pfmemalloc)
26639 +       if (pfmemalloc)
26640                 skb->pfmemalloc = 1;
26641         skb->head_frag = 1;
26642  
26643 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
26644  
26645  void __kfree_skb_flush(void)
26646  {
26647 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
26648 +       struct napi_alloc_cache *nc;
26649  
26650 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26651         /* flush skb_cache if containing objects */
26652         if (nc->skb_count) {
26653                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
26654                                      nc->skb_cache);
26655                 nc->skb_count = 0;
26656         }
26657 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26658  }
26659  
26660  static inline void _kfree_skb_defer(struct sk_buff *skb)
26661  {
26662 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
26663 +       struct napi_alloc_cache *nc;
26664  
26665         /* drop skb->head and call any destructors for packet */
26666         skb_release_all(skb);
26667  
26668 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26669         /* record skb to CPU local list */
26670         nc->skb_cache[nc->skb_count++] = skb;
26671  
26672 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
26673                                      nc->skb_cache);
26674                 nc->skb_count = 0;
26675         }
26676 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
26677  }
26678  void __kfree_skb_defer(struct sk_buff *skb)
26679  {
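
The skbuff.c changes swap local_irq_save()/this_cpu_ptr() pairs for local locks: on !RT a local lock still maps to disabling interrupts or preemption, while on RT it is a per-CPU sleeping spinlock, so the netdev and NAPI allocation caches stay consistent without keeping interrupts off. get_locked_var() acquires the lock and returns this CPU's variable; put_locked_var() releases it. A usage sketch under those assumptions (example_lock and example_counter are made-up names):

static DEFINE_LOCAL_IRQ_LOCK(example_lock);
static DEFINE_PER_CPU(int, example_counter);

static void example_update(void)
{
        unsigned long flags;
        int *p;

        /* interrupts really off only on !RT; a per-CPU lock on RT */
        local_lock_irqsave(example_lock, flags);
        this_cpu_inc(example_counter);
        local_unlock_irqrestore(example_lock, flags);

        /* lock plus per-CPU access in one step */
        p = &get_locked_var(example_lock, example_counter);
        (*p)++;
        put_locked_var(example_lock, example_counter);
}
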
26680 diff --git a/net/core/sock.c b/net/core/sock.c
26681 index 470a2043b846..2b09a5a33d8d 100644
26682 --- a/net/core/sock.c
26683 +++ b/net/core/sock.c
26684 @@ -2499,12 +2499,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
26685         if (sk->sk_lock.owned)
26686                 __lock_sock(sk);
26687         sk->sk_lock.owned = 1;
26688 -       spin_unlock(&sk->sk_lock.slock);
26689 +       spin_unlock_bh(&sk->sk_lock.slock);
26690         /*
26691          * The sk_lock has mutex_lock() semantics here:
26692          */
26693         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
26694 -       local_bh_enable();
26695  }
26696  EXPORT_SYMBOL(lock_sock_nested);
26697  
26698 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
26699 index 48734ee6293f..e6864ff11352 100644
26700 --- a/net/ipv4/icmp.c
26701 +++ b/net/ipv4/icmp.c
26702 @@ -69,6 +69,7 @@
26703  #include <linux/jiffies.h>
26704  #include <linux/kernel.h>
26705  #include <linux/fcntl.h>
26706 +#include <linux/sysrq.h>
26707  #include <linux/socket.h>
26708  #include <linux/in.h>
26709  #include <linux/inet.h>
26710 @@ -77,6 +78,7 @@
26711  #include <linux/string.h>
26712  #include <linux/netfilter_ipv4.h>
26713  #include <linux/slab.h>
26714 +#include <linux/locallock.h>
26715  #include <net/snmp.h>
26716  #include <net/ip.h>
26717  #include <net/route.h>
26718 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
26719   *
26720   *     On SMP we have one ICMP socket per-cpu.
26721   */
26722 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
26723 +
26724  static struct sock *icmp_sk(struct net *net)
26725  {
26726         return *this_cpu_ptr(net->ipv4.icmp_sk);
26727 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
26728  
26729         local_bh_disable();
26730  
26731 +       local_lock(icmp_sk_lock);
26732         sk = icmp_sk(net);
26733  
26734         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
26735                 /* This can happen if the output path signals a
26736                  * dst_link_failure() for an outgoing ICMP packet.
26737                  */
26738 +               local_unlock(icmp_sk_lock);
26739                 local_bh_enable();
26740                 return NULL;
26741         }
26742 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
26743  static inline void icmp_xmit_unlock(struct sock *sk)
26744  {
26745         spin_unlock_bh(&sk->sk_lock.slock);
26746 +       local_unlock(icmp_sk_lock);
26747  }
26748  
26749  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
26750 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
26751         struct sock *sk;
26752         struct sk_buff *skb;
26753  
26754 +       local_lock(icmp_sk_lock);
26755         sk = icmp_sk(dev_net((*rt)->dst.dev));
26756         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
26757                            icmp_param->data_len+icmp_param->head_len,
26758 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
26759                 skb->ip_summed = CHECKSUM_NONE;
26760                 ip_push_pending_frames(sk, fl4);
26761         }
26762 +       local_unlock(icmp_sk_lock);
26763  }
26764  
26765  /*
26766 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
26767  }
26768  
26769  /*
26770 + * 32bit and 64bit have different timestamp lengths, so we check for
26771 + * the cookie at offset 20 and verify it is repeated at offset 50
26772 + */
26773 +#define CO_POS0                20
26774 +#define CO_POS1                50
26775 +#define CO_SIZE                sizeof(int)
26776 +#define ICMP_SYSRQ_SIZE        57
26777 +
26778 +/*
26779 + * We got an ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
26780 + * pattern and if it matches send the next byte as a trigger to sysrq.
26781 + */
26782 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
26783 +{
26784 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
26785 +       char *p = skb->data;
26786 +
26787 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
26788 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
26789 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
26790 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
26791 +}
26792 +
26793 +/*
26794   *     Handle ICMP_ECHO ("ping") requests.
26795   *
26796   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
26797 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
26798                 icmp_param.data_len        = skb->len;
26799                 icmp_param.head_len        = sizeof(struct icmphdr);
26800                 icmp_reply(&icmp_param, skb);
26801 +
26802 +               if (skb->len == ICMP_SYSRQ_SIZE &&
26803 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
26804 +                       icmp_check_sysrq(net, skb);
26805 +               }
26806         }
26807         /* should there be an ICMP stat for ignored echos? */
26808         return true;
26809 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
26810 index 80bc36b25de2..215b90adfb05 100644
26811 --- a/net/ipv4/sysctl_net_ipv4.c
26812 +++ b/net/ipv4/sysctl_net_ipv4.c
26813 @@ -681,6 +681,13 @@ static struct ctl_table ipv4_net_table[] = {
26814                 .proc_handler   = proc_dointvec
26815         },
26816         {
26817 +               .procname       = "icmp_echo_sysrq",
26818 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
26819 +               .maxlen         = sizeof(int),
26820 +               .mode           = 0644,
26821 +               .proc_handler   = proc_dointvec
26822 +       },
26823 +       {
26824                 .procname       = "icmp_ignore_bogus_error_responses",
26825                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
26826                 .maxlen         = sizeof(int),
26827 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
26828 index 6988566dc72f..672fffcde28c 100644
26829 --- a/net/ipv4/tcp_ipv4.c
26830 +++ b/net/ipv4/tcp_ipv4.c
26831 @@ -62,6 +62,7 @@
26832  #include <linux/init.h>
26833  #include <linux/times.h>
26834  #include <linux/slab.h>
26835 +#include <linux/locallock.h>
26836  
26837  #include <net/net_namespace.h>
26838  #include <net/icmp.h>
26839 @@ -568,6 +569,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
26840  }
26841  EXPORT_SYMBOL(tcp_v4_send_check);
26842  
26843 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
26844  /*
26845   *     This routine will send an RST to the other tcp.
26846   *
26847 @@ -695,6 +697,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
26848                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
26849  
26850         arg.tos = ip_hdr(skb)->tos;
26851 +
26852 +       local_lock(tcp_sk_lock);
26853         local_bh_disable();
26854         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
26855                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
26856 @@ -704,6 +708,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
26857         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
26858         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
26859         local_bh_enable();
26860 +       local_unlock(tcp_sk_lock);
26861  
26862  #ifdef CONFIG_TCP_MD5SIG
26863  out:
26864 @@ -779,6 +784,7 @@ static void tcp_v4_send_ack(struct net *net,
26865         if (oif)
26866                 arg.bound_dev_if = oif;
26867         arg.tos = tos;
26868 +       local_lock(tcp_sk_lock);
26869         local_bh_disable();
26870         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
26871                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
26872 @@ -787,6 +793,7 @@ static void tcp_v4_send_ack(struct net *net,
26873  
26874         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
26875         local_bh_enable();
26876 +       local_unlock(tcp_sk_lock);
26877  }
26878  
26879  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
26880 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
26881 index acaaf616da71..09020dbcc089 100644
26882 --- a/net/mac80211/rx.c
26883 +++ b/net/mac80211/rx.c
26884 @@ -4230,7 +4230,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
26885         struct ieee80211_supported_band *sband;
26886         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
26887  
26888 -       WARN_ON_ONCE(softirq_count() == 0);
26889 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
26890  
26891         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
26892                 goto drop;
26893 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
26894 index 004af030ef1a..b64f751bda45 100644
26895 --- a/net/netfilter/core.c
26896 +++ b/net/netfilter/core.c
26897 @@ -22,12 +22,18 @@
26898  #include <linux/proc_fs.h>
26899  #include <linux/mutex.h>
26900  #include <linux/slab.h>
26901 +#include <linux/locallock.h>
26902  #include <linux/rcupdate.h>
26903  #include <net/net_namespace.h>
26904  #include <net/sock.h>
26905  
26906  #include "nf_internals.h"
26907  
26908 +#ifdef CONFIG_PREEMPT_RT_BASE
26909 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
26910 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
26911 +#endif
26912 +
26913  static DEFINE_MUTEX(afinfo_mutex);
26914  
26915  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
26916 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
26917 index cb76ff3088e9..3f42c5b1af55 100644
26918 --- a/net/packet/af_packet.c
26919 +++ b/net/packet/af_packet.c
26920 @@ -63,6 +63,7 @@
26921  #include <linux/if_packet.h>
26922  #include <linux/wireless.h>
26923  #include <linux/kernel.h>
26924 +#include <linux/delay.h>
26925  #include <linux/kmod.h>
26926  #include <linux/slab.h>
26927  #include <linux/vmalloc.h>
26928 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
26929         if (BLOCK_NUM_PKTS(pbd)) {
26930                 while (atomic_read(&pkc->blk_fill_in_prog)) {
26931                         /* Waiting for skb_copy_bits to finish... */
26932 -                       cpu_relax();
26933 +                       cpu_chill();
26934                 }
26935         }
26936  
26937 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
26938                 if (!(status & TP_STATUS_BLK_TMO)) {
26939                         while (atomic_read(&pkc->blk_fill_in_prog)) {
26940                                 /* Waiting for skb_copy_bits to finish... */
26941 -                               cpu_relax();
26942 +                               cpu_chill();
26943                         }
26944                 }
26945                 prb_close_block(pkc, pbd, po, status);
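
The cpu_relax() loops above busy-wait for another context to finish skb_copy_bits(); on RT that other context can be a preempted, possibly lower-priority task, so spinning may never let it run. cpu_chill() sleeps briefly instead (hence the new linux/delay.h include). A sketch of the helper as the RT series is expected to define it:

/* sketch only */
#ifdef CONFIG_PREEMPT_RT_FULL
extern void cpu_chill(void);    /* short hrtimer-based sleep */
#else
# define cpu_chill()    cpu_relax()
#endif
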
26946 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
26947 index 977f69886c00..f3e7a36b0396 100644
26948 --- a/net/rds/ib_rdma.c
26949 +++ b/net/rds/ib_rdma.c
26950 @@ -34,6 +34,7 @@
26951  #include <linux/slab.h>
26952  #include <linux/rculist.h>
26953  #include <linux/llist.h>
26954 +#include <linux/delay.h>
26955  
26956  #include "rds_single_path.h"
26957  #include "ib_mr.h"
26958 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
26959         for_each_online_cpu(cpu) {
26960                 flag = &per_cpu(clean_list_grace, cpu);
26961                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
26962 -                       cpu_relax();
26963 +                       cpu_chill();
26964         }
26965  }
26966  
26967 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
26968 index 7d921e56e715..13df56a738e5 100644
26969 --- a/net/rxrpc/security.c
26970 +++ b/net/rxrpc/security.c
26971 @@ -19,9 +19,6 @@
26972  #include <keys/rxrpc-type.h>
26973  #include "ar-internal.h"
26974  
26975 -static LIST_HEAD(rxrpc_security_methods);
26976 -static DECLARE_RWSEM(rxrpc_security_sem);
26977 -
26978  static const struct rxrpc_security *rxrpc_security_types[] = {
26979         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
26980  #ifdef CONFIG_RXKAD
26981 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
26982 index 206dc24add3a..00ea9bde5bb3 100644
26983 --- a/net/sched/sch_api.c
26984 +++ b/net/sched/sch_api.c
26985 @@ -981,7 +981,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
26986                         rcu_assign_pointer(sch->stab, stab);
26987                 }
26988                 if (tca[TCA_RATE]) {
26989 -                       seqcount_t *running;
26990 +                       net_seqlock_t *running;
26991  
26992                         err = -EOPNOTSUPP;
26993                         if (sch->flags & TCQ_F_MQROOT)
26994 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
26995 index 6cfb6e9038c2..20727e1347de 100644
26996 --- a/net/sched/sch_generic.c
26997 +++ b/net/sched/sch_generic.c
26998 @@ -425,7 +425,11 @@ struct Qdisc noop_qdisc = {
26999         .ops            =       &noop_qdisc_ops,
27000         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
27001         .dev_queue      =       &noop_netdev_queue,
27002 +#ifdef CONFIG_PREEMPT_RT_BASE
27003 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
27004 +#else
27005         .running        =       SEQCNT_ZERO(noop_qdisc.running),
27006 +#endif
27007         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
27008  };
27009  EXPORT_SYMBOL(noop_qdisc);
27010 @@ -624,9 +628,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
27011         lockdep_set_class(&sch->busylock,
27012                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
27013  
27014 +#ifdef CONFIG_PREEMPT_RT_BASE
27015 +       seqlock_init(&sch->running);
27016 +       lockdep_set_class(&sch->running.seqcount,
27017 +                         dev->qdisc_running_key ?: &qdisc_running_key);
27018 +       lockdep_set_class(&sch->running.lock,
27019 +                         dev->qdisc_running_key ?: &qdisc_running_key);
27020 +#else
27021         seqcount_init(&sch->running);
27022         lockdep_set_class(&sch->running,
27023                           dev->qdisc_running_key ?: &qdisc_running_key);
27024 +#endif
27025  
27026         sch->ops = ops;
27027         sch->enqueue = ops->enqueue;
27028 @@ -925,7 +937,7 @@ void dev_deactivate_many(struct list_head *head)
27029         /* Wait for outstanding qdisc_run calls. */
27030         list_for_each_entry(dev, head, close_list)
27031                 while (some_qdisc_is_busy(dev))
27032 -                       yield();
27033 +                       msleep(1);
27034  }
27035  
27036  void dev_deactivate(struct net_device *dev)
27037 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
27038 index 9c9db55a0c1e..e6583b018a72 100644
27039 --- a/net/sunrpc/svc_xprt.c
27040 +++ b/net/sunrpc/svc_xprt.c
27041 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27042                 goto out;
27043         }
27044  
27045 -       cpu = get_cpu();
27046 +       cpu = get_cpu_light();
27047         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
27048  
27049         atomic_long_inc(&pool->sp_stats.packets);
27050 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27051  
27052                 atomic_long_inc(&pool->sp_stats.threads_woken);
27053                 wake_up_process(rqstp->rq_task);
27054 -               put_cpu();
27055 +               put_cpu_light();
27056                 goto out;
27057         }
27058         rcu_read_unlock();
27059 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
27060                 goto redo_search;
27061         }
27062         rqstp = NULL;
27063 -       put_cpu();
27064 +       put_cpu_light();
27065  out:
27066         trace_svc_xprt_do_enqueue(xprt, rqstp);
27067  }
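
get_cpu() pins the task by disabling preemption, which here only serves to keep the CPU number stable while picking a pool, but it would also cover the wake_up_process() call, something RT tries to avoid. get_cpu_light() is expected to disable only migration on RT. A sketch under that assumption:

/* sketch: stable smp_processor_id() without disabling preemption on RT */
#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()        migrate_enable()
#else
# define get_cpu_light()        get_cpu()
# define put_cpu_light()        put_cpu()
#endif
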
27068 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
27069 index 6fdc97ef6023..523e0420d7f0 100755
27070 --- a/scripts/mkcompile_h
27071 +++ b/scripts/mkcompile_h
27072 @@ -4,7 +4,8 @@ TARGET=$1
27073  ARCH=$2
27074  SMP=$3
27075  PREEMPT=$4
27076 -CC=$5
27077 +RT=$5
27078 +CC=$6
27079  
27080  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
27081  
27082 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
27083  CONFIG_FLAGS=""
27084  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
27085  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
27086 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
27087  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
27088  
27089  # Truncate to maximum length
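
With the extra argument, an RT-enabled build advertises itself in the version banner: assuming the corresponding Makefile hunk passes the RT config symbol as the new fifth parameter, uname -v on such a kernel reports something like "#1 SMP PREEMPT RT ..." instead of "#1 SMP PREEMPT ...".
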
27090 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
27091 index 9d33c1e85c79..3d307bda86f9 100644
27092 --- a/sound/core/pcm_native.c
27093 +++ b/sound/core/pcm_native.c
27094 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
27095  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
27096  {
27097         if (!substream->pcm->nonatomic)
27098 -               local_irq_disable();
27099 +               local_irq_disable_nort();
27100         snd_pcm_stream_lock(substream);
27101  }
27102  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
27103 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
27104  {
27105         snd_pcm_stream_unlock(substream);
27106         if (!substream->pcm->nonatomic)
27107 -               local_irq_enable();
27108 +               local_irq_enable_nort();
27109  }
27110  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
27111  
27112 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
27113  {
27114         unsigned long flags = 0;
27115         if (!substream->pcm->nonatomic)
27116 -               local_irq_save(flags);
27117 +               local_irq_save_nort(flags);
27118         snd_pcm_stream_lock(substream);
27119         return flags;
27120  }
27121 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
27122  {
27123         snd_pcm_stream_unlock(substream);
27124         if (!substream->pcm->nonatomic)
27125 -               local_irq_restore(flags);
27126 +               local_irq_restore_nort(flags);
27127  }
27128  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
27129  
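
snd_pcm_stream_lock() takes a spinlock that becomes a sleeping lock on RT, so it must not be entered with interrupts hard-disabled; the _nort variants therefore touch the interrupt state only on !RT builds. A sketch of the expected mapping (the exact definitions live in this patch's irqflags changes, not shown here):

/* sketch only */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_disable_nort()       barrier()
# define local_irq_enable_nort()        barrier()
# define local_irq_save_nort(flags)     local_save_flags(flags)
# define local_irq_restore_nort(flags)  (void)(flags)
#else
# define local_irq_disable_nort()       local_irq_disable()
# define local_irq_enable_nort()        local_irq_enable()
# define local_irq_save_nort(flags)     local_irq_save(flags)
# define local_irq_restore_nort(flags)  local_irq_restore(flags)
#endif
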