1 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
2 index 3a3b30ac2a75..9e0745cafbd8 100644
3 --- a/Documentation/sysrq.txt
4 +++ b/Documentation/sysrq.txt
5 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
6  On other - If you know of the key combos for other architectures, please
7             let me know so I can add them to this section.
8  
9 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
10 -
11 +On all -  write a character to /proc/sysrq-trigger, e.g.:
12                 echo t > /proc/sysrq-trigger
13  
14 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
15 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
16 +        Send an ICMP echo request with this pattern plus the particular
17 +        SysRq command key. Example:
18 +               # ping -c1 -s57 -p0102030468
19 +        will trigger the SysRq-H (help) command.
20 +
21 +
22  *  What are the 'command' keys?
23  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24  'b'     - Will immediately reboot the system without syncing or unmounting
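A note on the network SysRq example above: the ping pattern is the icmp_echo_sysrq cookie followed by the ASCII code of the desired command key, so 'h' (0x68) yields -p0102030468. Below is a minimal shell sketch, assuming a POSIX shell on the sending host and that the cookie 0x01020304 has already been written on the target; the sysrq_ping helper name is made up for illustration.

# sysrq_ping KEY TARGET - trigger SysRq command KEY on TARGET via ICMP echo
sysrq_ping() {
        key_hex=$(printf '%02x' "'$1")        # ASCII code of the key, e.g. h -> 68
        ping -c1 -s57 -p "01020304${key_hex}" "$2"
}
# Example: sysrq_ping h 192.0.2.1   (triggers SysRq-H, i.e. help, on the target)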
25 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
26 new file mode 100644
27 index 000000000000..6f2aeabf7faa
28 --- /dev/null
29 +++ b/Documentation/trace/histograms.txt
30 @@ -0,0 +1,186 @@
31 +               Using the Linux Kernel Latency Histograms
32 +
33 +
34 +This document gives a short explanation of how to enable, configure, and use
35 +latency histograms. Latency histograms are primarily relevant in the
36 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
37 +and are used in the quality management of the Linux real-time
38 +capabilities.
39 +
40 +
41 +* Purpose of latency histograms
42 +
43 +A latency histogram continuously accumulates the frequencies of latency
44 +data. There are two types of histograms:
45 +- potential sources of latencies
46 +- effective latencies
47 +
48 +
49 +* Potential sources of latencies
50 +
51 +Potential sources of latencies are code segments where interrupts,
52 +preemption or both are disabled (aka critical sections). To create
53 +histograms of potential sources of latency, the kernel stores the time
54 +stamp at the start of a critical section, determines the time elapsed
55 +when the end of the section is reached, and increments the frequency
56 +counter of that latency value - irrespective of whether any concurrently
57 +running process is affected by latency or not.
58 +- Configuration items (in the Kernel hacking/Tracers submenu)
59 +  CONFIG_INTERRUPT_OFF_LATENCY
60 +  CONFIG_PREEMPT_OFF_LATENCY
61 +
62 +
63 +* Effective latencies
64 +
65 +Effective latencies are those actually occurring during wakeup of a process. To
66 +determine effective latencies, the kernel stores the time stamp when a
67 +process is scheduled to be woken up, and determines the duration of the
68 +wakeup time shortly before control is passed over to this process. Note
69 +that the apparent latency in user space may be somewhat longer, since the
70 +process may be interrupted after control is passed over to it but before
71 +the execution in user space takes place. Simply measuring the interval
72 +between enqueuing and wakeup may also not be appropriate in cases when a
73 +process is scheduled as a result of a timer expiration. The timer may have
74 +missed its deadline, e.g. due to disabled interrupts, but this latency
75 +would not be registered. Therefore, the offsets of missed timers are
76 +recorded in a separate histogram. If both wakeup latency and missed timer
77 +offsets are configured and enabled, a third histogram may be enabled that
78 +records the overall latency as a sum of the timer latency, if any, and the
79 +wakeup latency. This histogram is called "timerandwakeup".
80 +- Configuration items (in the Kernel hacking/Tracers submenu)
81 +  CONFIG_WAKEUP_LATENCY
82 +  CONFIG_MISSED_TIMER_OFFSETS
83 +
84 +
85 +* Usage
86 +
87 +The interface to the administration of the latency histograms is located
88 +in the debugfs file system. To mount it, either enter
89 +
90 +mount -t sysfs nodev /sys
91 +mount -t debugfs nodev /sys/kernel/debug
92 +
93 +from the shell command line, or add
94 +
95 +nodev  /sys                    sysfs   defaults        0 0
96 +nodev  /sys/kernel/debug       debugfs defaults        0 0
97 +
98 +to the file /etc/fstab. All latency histogram related files are then
99 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
100 +particular histogram type is enabled by writing non-zero to the related
101 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
102 +Select "preemptirqsoff" for the histograms of potential sources of
103 +latencies and "wakeup" for histograms of effective latencies, etc. The
104 +histogram data - one per CPU - are available in the files
105 +
106 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
107 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
108 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
109 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
110 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
111 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
112 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
113 +
114 +The histograms are reset by writing non-zero to the file "reset" in a
115 +particular latency directory. To reset all latency data, use
116 +
117 +#!/bin/sh
118 +
119 +TRACINGDIR=/sys/kernel/debug/tracing
120 +HISTDIR=$TRACINGDIR/latency_hist
121 +
122 +if test -d $HISTDIR
123 +then
124 +  cd $HISTDIR
125 +  for i in `find . | grep /reset$`
126 +  do
127 +    echo 1 >$i
128 +  done
129 +fi
130 +
131 +
132 +* Data format
133 +
134 +Latency data are stored with a resolution of one microsecond. The
135 +maximum latency is 10,240 microseconds. The data are only valid if the
136 +overflow register is empty. Every output line contains the latency in
137 +microseconds in the first column and the number of samples in the second
138 +column. To display only lines with a positive latency count, use, for
139 +example,
140 +
141 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
142 +
143 +#Minimum latency: 0 microseconds.
144 +#Average latency: 0 microseconds.
145 +#Maximum latency: 25 microseconds.
146 +#Total samples: 3104770694
147 +#There are 0 samples greater or equal than 10240 microseconds
148 +#usecs          samples
149 +    0        2984486876
150 +    1          49843506
151 +    2          58219047
152 +    3           5348126
153 +    4           2187960
154 +    5           3388262
155 +    6            959289
156 +    7            208294
157 +    8             40420
158 +    9              4485
159 +   10             14918
160 +   11             18340
161 +   12             25052
162 +   13             19455
163 +   14              5602
164 +   15               969
165 +   16                47
166 +   17                18
167 +   18                14
168 +   19                 1
169 +   20                 3
170 +   21                 2
171 +   22                 5
172 +   23                 2
173 +   25                 1
174 +
175 +
176 +* Wakeup latency of a selected process
177 +
178 +To only collect wakeup latency data of a particular process, write the
179 +PID of the requested process to
180 +
181 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
182 +
183 +PIDs are not considered if this variable is set to 0.
184 +
185 +
186 +* Details of the process with the highest wakeup latency so far
187 +
188 +Selected data of the process that suffered from the highest wakeup
189 +latency that occurred on a particular CPU are available in the file
190 +
191 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
192 +
193 +In addition, other relevant system data at the time when the
194 +latency occurred are given.
195 +
196 +The format of the data is (all in one line):
197 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
198 +<- <PID> <Priority> <Command> <Timestamp>
199 +
200 +The value of <Timeroffset> is only relevant in the combined timer
201 +and wakeup latency recording. In the wakeup recording, it is
202 +always 0; in the missed_timer_offsets recording, it is the same
203 +as <Latency>.
204 +
205 +When retrospectively searching for the origin of a latency while
206 +tracing was not enabled, it may be helpful to know the name and
207 +some basic data of the task that (finally) switched to the
208 +late real-time task. In addition to the victim's data, the
209 +data of the possible culprit are therefore also displayed after the
210 +"<-" symbol.
211 +
212 +Finally, the timestamp of the time when the latency occurred
213 +in <seconds>.<microseconds> after the most recent system boot
214 +is provided.
215 +
216 +These data are also reset when the wakeup histogram is reset.
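Putting the histogram interface described above together, here is a minimal usage sketch in shell. It assumes debugfs is mounted at /sys/kernel/debug as in the fstab example and that wakeup latency histograms are compiled in; the file names follow the description in the hunk.

HIST=/sys/kernel/debug/tracing/latency_hist
echo 1 >$HIST/enable/wakeup             # enable the wakeup latency histograms
# echo <pid> >$HIST/wakeup/pid          # optional: restrict to one process (0 = all PIDs)
for cpu in $HIST/wakeup/CPU*            # print only non-empty buckets, per CPU
do
  echo "== ${cpu##*/} =="
  grep -v " 0$" $cpu
done
echo 1 >$HIST/wakeup/reset              # reset the wakeup histogram when done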
217 diff --git a/arch/Kconfig b/arch/Kconfig
218 index 659bdd079277..099fc0f5155e 100644
219 --- a/arch/Kconfig
220 +++ b/arch/Kconfig
221 @@ -9,6 +9,7 @@ config OPROFILE
222         tristate "OProfile system profiling"
223         depends on PROFILING
224         depends on HAVE_OPROFILE
225 +       depends on !PREEMPT_RT_FULL
226         select RING_BUFFER
227         select RING_BUFFER_ALLOW_SWAP
228         help
229 @@ -52,6 +53,7 @@ config KPROBES
230  config JUMP_LABEL
231         bool "Optimize very unlikely/likely branches"
232         depends on HAVE_ARCH_JUMP_LABEL
233 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
234         help
235           This option enables a transparent branch optimization that
236          makes certain almost-always-true or almost-always-false branch
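Since JUMP_LABEL now becomes mutually exclusive with the four latency-histogram tracers, a quick sanity check against an existing kernel configuration can look like the sketch below; the config file path is an assumption, adjust it to wherever your kernel config lives.

grep -E 'CONFIG_(JUMP_LABEL|INTERRUPT_OFF_HIST|PREEMPT_OFF_HIST|WAKEUP_LATENCY_HIST|MISSED_TIMER_OFFSETS_HIST)=' \
        /boot/config-$(uname -r)        # lists whichever of these options are enabled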
237 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
238 index b5d529fdffab..5715844e83e3 100644
239 --- a/arch/arm/Kconfig
240 +++ b/arch/arm/Kconfig
241 @@ -36,7 +36,7 @@ config ARM
242         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
243         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
244         select HAVE_ARCH_HARDENED_USERCOPY
245 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
246 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
247         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
248         select HAVE_ARCH_MMAP_RND_BITS if MMU
249         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
250 @@ -75,6 +75,7 @@ config ARM
251         select HAVE_PERF_EVENTS
252         select HAVE_PERF_REGS
253         select HAVE_PERF_USER_STACK_DUMP
254 +       select HAVE_PREEMPT_LAZY
255         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
256         select HAVE_REGS_AND_STACK_ACCESS_API
257         select HAVE_SYSCALL_TRACEPOINTS
258 diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h
259 index e53638c8ed8a..6095a1649865 100644
260 --- a/arch/arm/include/asm/irq.h
261 +++ b/arch/arm/include/asm/irq.h
262 @@ -22,6 +22,8 @@
263  #endif
264  
265  #ifndef __ASSEMBLY__
266 +#include <linux/cpumask.h>
267 +
268  struct irqaction;
269  struct pt_regs;
270  extern void migrate_irqs(void);
271 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
272 index 12ebfcc1d539..c962084605bc 100644
273 --- a/arch/arm/include/asm/switch_to.h
274 +++ b/arch/arm/include/asm/switch_to.h
275 @@ -3,6 +3,13 @@
276  
277  #include <linux/thread_info.h>
278  
279 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
280 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
281 +#else
282 +static inline void
283 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
284 +#endif
285 +
286  /*
287   * For v7 SMP cores running a preemptible kernel we may be pre-empted
288   * during a TLB maintenance operation, so execute an inner-shareable dsb
289 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
290  #define switch_to(prev,next,last)                                      \
291  do {                                                                   \
292         __complete_pending_tlbi();                                      \
293 +       switch_kmaps(prev, next);                                       \
294         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
295  } while (0)
296  
297 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
298 index 776757d1604a..1f36a4eccc72 100644
299 --- a/arch/arm/include/asm/thread_info.h
300 +++ b/arch/arm/include/asm/thread_info.h
301 @@ -49,6 +49,7 @@ struct cpu_context_save {
302  struct thread_info {
303         unsigned long           flags;          /* low level flags */
304         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
305 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
306         mm_segment_t            addr_limit;     /* address limit */
307         struct task_struct      *task;          /* main task structure */
308         __u32                   cpu;            /* cpu */
309 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
310  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
311  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
312  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
313 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
314 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
315 +#define TIF_NEED_RESCHED_LAZY  7
316  
317  #define TIF_NOHZ               12      /* in adaptive nohz mode */
318  #define TIF_USING_IWMMXT       17
319 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
320  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
321  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
322  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
323 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
324  #define _TIF_UPROBE            (1 << TIF_UPROBE)
325  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
326  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
327 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
328   * Change these and you break ASM code in entry-common.S
329   */
330  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
331 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
332 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
333 +                                _TIF_NEED_RESCHED_LAZY)
334  
335  #endif /* __KERNEL__ */
336  #endif /* __ASM_ARM_THREAD_INFO_H */
337 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
338 index 608008229c7d..3866da3f7bb7 100644
339 --- a/arch/arm/kernel/asm-offsets.c
340 +++ b/arch/arm/kernel/asm-offsets.c
341 @@ -65,6 +65,7 @@ int main(void)
342    BLANK();
343    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
344    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
345 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
346    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
347    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
348    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
349 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
350 index 9f157e7c51e7..468e224d76aa 100644
351 --- a/arch/arm/kernel/entry-armv.S
352 +++ b/arch/arm/kernel/entry-armv.S
353 @@ -220,11 +220,18 @@ ENDPROC(__dabt_svc)
354  
355  #ifdef CONFIG_PREEMPT
356         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
357 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
358         teq     r8, #0                          @ if preempt count != 0
359 +       bne     1f                              @ return from exception
360 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
361 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
362 +       blne    svc_preempt                     @ preempt!
363 +
364 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
365 +       teq     r8, #0                          @ if preempt lazy count != 0
366         movne   r0, #0                          @ force flags to 0
367 -       tst     r0, #_TIF_NEED_RESCHED
368 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
369         blne    svc_preempt
370 +1:
371  #endif
372  
373         svc_exit r5, irq = 1                    @ return from exception
374 @@ -239,8 +246,14 @@ ENDPROC(__irq_svc)
375  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
376         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
377         tst     r0, #_TIF_NEED_RESCHED
378 +       bne     1b
379 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
380         reteq   r8                              @ go again
381 -       b       1b
382 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
383 +       teq     r0, #0                          @ if preempt lazy count != 0
384 +       beq     1b
385 +       ret     r8                              @ go again
386 +
387  #endif
388  
389  __und_fault:
390 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
391 index 10c3283d6c19..8872937862cc 100644
392 --- a/arch/arm/kernel/entry-common.S
393 +++ b/arch/arm/kernel/entry-common.S
394 @@ -36,7 +36,9 @@
395   UNWIND(.cantunwind    )
396         disable_irq_notrace                     @ disable interrupts
397         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
398 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
399 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
400 +       bne     fast_work_pending
401 +       tst     r1, #_TIF_SECCOMP
402         bne     fast_work_pending
403  
404         /* perform architecture specific actions before user return */
405 @@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
406         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
407         disable_irq_notrace                     @ disable interrupts
408         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
409 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
410 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
411 +       bne     do_slower_path
412 +       tst     r1, #_TIF_SECCOMP
413         beq     no_work_pending
414 +do_slower_path:
415   UNWIND(.fnend         )
416  ENDPROC(ret_fast_syscall)
417  
418 diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c
419 index 69bda1a5707e..1f665acaa6a9 100644
420 --- a/arch/arm/kernel/patch.c
421 +++ b/arch/arm/kernel/patch.c
422 @@ -15,7 +15,7 @@ struct patch {
423         unsigned int insn;
424  };
425  
426 -static DEFINE_SPINLOCK(patch_lock);
427 +static DEFINE_RAW_SPINLOCK(patch_lock);
428  
429  static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
430         __acquires(&patch_lock)
431 @@ -32,7 +32,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags)
432                 return addr;
433  
434         if (flags)
435 -               spin_lock_irqsave(&patch_lock, *flags);
436 +               raw_spin_lock_irqsave(&patch_lock, *flags);
437         else
438                 __acquire(&patch_lock);
439  
440 @@ -47,7 +47,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags)
441         clear_fixmap(fixmap);
442  
443         if (flags)
444 -               spin_unlock_irqrestore(&patch_lock, *flags);
445 +               raw_spin_unlock_irqrestore(&patch_lock, *flags);
446         else
447                 __release(&patch_lock);
448  }
449 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
450 index 91d2d5b01414..750550098b59 100644
451 --- a/arch/arm/kernel/process.c
452 +++ b/arch/arm/kernel/process.c
453 @@ -322,6 +322,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
454  }
455  
456  #ifdef CONFIG_MMU
457 +/*
458 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
459 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
460 + * fail.
461 + */
462 +static int __init vectors_user_mapping_init_page(void)
463 +{
464 +       struct page *page;
465 +       unsigned long addr = 0xffff0000;
466 +       pgd_t *pgd;
467 +       pud_t *pud;
468 +       pmd_t *pmd;
469 +
470 +       pgd = pgd_offset_k(addr);
471 +       pud = pud_offset(pgd, addr);
472 +       pmd = pmd_offset(pud, addr);
473 +       page = pmd_page(*(pmd));
474 +
475 +       pgtable_page_ctor(page);
476 +
477 +       return 0;
478 +}
479 +late_initcall(vectors_user_mapping_init_page);
480 +
481  #ifdef CONFIG_KUSER_HELPERS
482  /*
483   * The vectors page is always readable from user space for the
484 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
485 index 7b8f2141427b..96541e00b74a 100644
486 --- a/arch/arm/kernel/signal.c
487 +++ b/arch/arm/kernel/signal.c
488 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
489          */
490         trace_hardirqs_off();
491         do {
492 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
493 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
494 +                                          _TIF_NEED_RESCHED_LAZY))) {
495                         schedule();
496                 } else {
497                         if (unlikely(!user_mode(regs)))
498 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
499 index 7dd14e8395e6..4cd7e3d98035 100644
500 --- a/arch/arm/kernel/smp.c
501 +++ b/arch/arm/kernel/smp.c
502 @@ -234,8 +234,6 @@ int __cpu_disable(void)
503         flush_cache_louis();
504         local_flush_tlb_all();
505  
506 -       clear_tasks_mm_cpumask(cpu);
507 -
508         return 0;
509  }
510  
511 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
512                 pr_err("CPU%u: cpu didn't die\n", cpu);
513                 return;
514         }
515 +
516 +       clear_tasks_mm_cpumask(cpu);
517 +
518         pr_notice("CPU%u: shutdown\n", cpu);
519  
520         /*
521 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
522 index 0bee233fef9a..314cfb232a63 100644
523 --- a/arch/arm/kernel/unwind.c
524 +++ b/arch/arm/kernel/unwind.c
525 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
526  static const struct unwind_idx *__origin_unwind_idx;
527  extern const struct unwind_idx __stop_unwind_idx[];
528  
529 -static DEFINE_SPINLOCK(unwind_lock);
530 +static DEFINE_RAW_SPINLOCK(unwind_lock);
531  static LIST_HEAD(unwind_tables);
532  
533  /* Convert a prel31 symbol to an absolute address */
534 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
535                 /* module unwind tables */
536                 struct unwind_table *table;
537  
538 -               spin_lock_irqsave(&unwind_lock, flags);
539 +               raw_spin_lock_irqsave(&unwind_lock, flags);
540                 list_for_each_entry(table, &unwind_tables, list) {
541                         if (addr >= table->begin_addr &&
542                             addr < table->end_addr) {
543 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
544                                 break;
545                         }
546                 }
547 -               spin_unlock_irqrestore(&unwind_lock, flags);
548 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
549         }
550  
551         pr_debug("%s: idx = %p\n", __func__, idx);
552 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
553         tab->begin_addr = text_addr;
554         tab->end_addr = text_addr + text_size;
555  
556 -       spin_lock_irqsave(&unwind_lock, flags);
557 +       raw_spin_lock_irqsave(&unwind_lock, flags);
558         list_add_tail(&tab->list, &unwind_tables);
559 -       spin_unlock_irqrestore(&unwind_lock, flags);
560 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
561  
562         return tab;
563  }
564 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
565         if (!tab)
566                 return;
567  
568 -       spin_lock_irqsave(&unwind_lock, flags);
569 +       raw_spin_lock_irqsave(&unwind_lock, flags);
570         list_del(&tab->list);
571 -       spin_unlock_irqrestore(&unwind_lock, flags);
572 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
573  
574         kfree(tab);
575  }
576 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
577 index 19b5f5c1c0ff..82aa639e6737 100644
578 --- a/arch/arm/kvm/arm.c
579 +++ b/arch/arm/kvm/arm.c
580 @@ -619,7 +619,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
581                  * involves poking the GIC, which must be done in a
582                  * non-preemptible context.
583                  */
584 -               preempt_disable();
585 +               migrate_disable();
586                 kvm_pmu_flush_hwstate(vcpu);
587                 kvm_timer_flush_hwstate(vcpu);
588                 kvm_vgic_flush_hwstate(vcpu);
589 @@ -640,7 +640,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
590                         kvm_pmu_sync_hwstate(vcpu);
591                         kvm_timer_sync_hwstate(vcpu);
592                         kvm_vgic_sync_hwstate(vcpu);
593 -                       preempt_enable();
594 +                       migrate_enable();
595                         continue;
596                 }
597  
598 @@ -696,7 +696,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
599  
600                 kvm_vgic_sync_hwstate(vcpu);
601  
602 -               preempt_enable();
603 +               migrate_enable();
604  
605                 ret = handle_exit(vcpu, run, ret);
606         }
607 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
608 index 98ffe1e62ad5..df9769ddece5 100644
609 --- a/arch/arm/mach-exynos/platsmp.c
610 +++ b/arch/arm/mach-exynos/platsmp.c
611 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
612         return (void __iomem *)(S5P_VA_SCU);
613  }
614  
615 -static DEFINE_SPINLOCK(boot_lock);
616 +static DEFINE_RAW_SPINLOCK(boot_lock);
617  
618  static void exynos_secondary_init(unsigned int cpu)
619  {
620 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
621         /*
622          * Synchronise with the boot thread.
623          */
624 -       spin_lock(&boot_lock);
625 -       spin_unlock(&boot_lock);
626 +       raw_spin_lock(&boot_lock);
627 +       raw_spin_unlock(&boot_lock);
628  }
629  
630  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
631 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
632          * Set synchronisation state between this boot processor
633          * and the secondary one
634          */
635 -       spin_lock(&boot_lock);
636 +       raw_spin_lock(&boot_lock);
637  
638         /*
639          * The secondary processor is waiting to be released from
640 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
641  
642                 if (timeout == 0) {
643                         printk(KERN_ERR "cpu1 power enable failed");
644 -                       spin_unlock(&boot_lock);
645 +                       raw_spin_unlock(&boot_lock);
646                         return -ETIMEDOUT;
647                 }
648         }
649 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
650          * calibrations, then wait for it to finish
651          */
652  fail:
653 -       spin_unlock(&boot_lock);
654 +       raw_spin_unlock(&boot_lock);
655  
656         return pen_release != -1 ? ret : 0;
657  }
658 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
659 index 4b653a8cb75c..b03d5a922cb1 100644
660 --- a/arch/arm/mach-hisi/platmcpm.c
661 +++ b/arch/arm/mach-hisi/platmcpm.c
662 @@ -61,7 +61,7 @@
663  
664  static void __iomem *sysctrl, *fabric;
665  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
666 -static DEFINE_SPINLOCK(boot_lock);
667 +static DEFINE_RAW_SPINLOCK(boot_lock);
668  static u32 fabric_phys_addr;
669  /*
670   * [0]: bootwrapper physical address
671 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
672         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
673                 return -EINVAL;
674  
675 -       spin_lock_irq(&boot_lock);
676 +       raw_spin_lock_irq(&boot_lock);
677  
678         if (hip04_cpu_table[cluster][cpu])
679                 goto out;
680 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
681  
682  out:
683         hip04_cpu_table[cluster][cpu]++;
684 -       spin_unlock_irq(&boot_lock);
685 +       raw_spin_unlock_irq(&boot_lock);
686  
687         return 0;
688  }
689 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
690         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
691         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
692  
693 -       spin_lock(&boot_lock);
694 +       raw_spin_lock(&boot_lock);
695         hip04_cpu_table[cluster][cpu]--;
696         if (hip04_cpu_table[cluster][cpu] == 1) {
697                 /* A power_up request went ahead of us. */
698 -               spin_unlock(&boot_lock);
699 +               raw_spin_unlock(&boot_lock);
700                 return;
701         } else if (hip04_cpu_table[cluster][cpu] > 1) {
702                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
703 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
704         }
705  
706         last_man = hip04_cluster_is_down(cluster);
707 -       spin_unlock(&boot_lock);
708 +       raw_spin_unlock(&boot_lock);
709         if (last_man) {
710                 /* Since it's Cortex A15, disable L2 prefetching. */
711                 asm volatile(
712 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
713                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
714  
715         count = TIMEOUT_MSEC / POLL_MSEC;
716 -       spin_lock_irq(&boot_lock);
717 +       raw_spin_lock_irq(&boot_lock);
718         for (tries = 0; tries < count; tries++) {
719                 if (hip04_cpu_table[cluster][cpu])
720                         goto err;
721 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
722                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
723                 if (data & CORE_WFI_STATUS(cpu))
724                         break;
725 -               spin_unlock_irq(&boot_lock);
726 +               raw_spin_unlock_irq(&boot_lock);
727                 /* Wait for clean L2 when the whole cluster is down. */
728                 msleep(POLL_MSEC);
729 -               spin_lock_irq(&boot_lock);
730 +               raw_spin_lock_irq(&boot_lock);
731         }
732         if (tries >= count)
733                 goto err;
734 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
735                 goto err;
736         if (hip04_cluster_is_down(cluster))
737                 hip04_set_snoop_filter(cluster, 0);
738 -       spin_unlock_irq(&boot_lock);
739 +       raw_spin_unlock_irq(&boot_lock);
740         return 1;
741  err:
742 -       spin_unlock_irq(&boot_lock);
743 +       raw_spin_unlock_irq(&boot_lock);
744         return 0;
745  }
746  #endif
747 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
748 index b4de3da6dffa..b52893319d75 100644
749 --- a/arch/arm/mach-omap2/omap-smp.c
750 +++ b/arch/arm/mach-omap2/omap-smp.c
751 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
752         .startup_addr = omap5_secondary_startup,
753  };
754  
755 -static DEFINE_SPINLOCK(boot_lock);
756 +static DEFINE_RAW_SPINLOCK(boot_lock);
757  
758  void __iomem *omap4_get_scu_base(void)
759  {
760 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
761         /*
762          * Synchronise with the boot thread.
763          */
764 -       spin_lock(&boot_lock);
765 -       spin_unlock(&boot_lock);
766 +       raw_spin_lock(&boot_lock);
767 +       raw_spin_unlock(&boot_lock);
768  }
769  
770  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
771 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
772          * Set synchronisation state between this boot processor
773          * and the secondary one
774          */
775 -       spin_lock(&boot_lock);
776 +       raw_spin_lock(&boot_lock);
777  
778         /*
779          * Update the AuxCoreBoot0 with boot state for secondary core.
780 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
781          * Now the secondary core is starting up let it run its
782          * calibrations, then wait for it to finish
783          */
784 -       spin_unlock(&boot_lock);
785 +       raw_spin_unlock(&boot_lock);
786  
787         return 0;
788  }
789 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
790 index 0875b99add18..18b6d98d2581 100644
791 --- a/arch/arm/mach-prima2/platsmp.c
792 +++ b/arch/arm/mach-prima2/platsmp.c
793 @@ -22,7 +22,7 @@
794  
795  static void __iomem *clk_base;
796  
797 -static DEFINE_SPINLOCK(boot_lock);
798 +static DEFINE_RAW_SPINLOCK(boot_lock);
799  
800  static void sirfsoc_secondary_init(unsigned int cpu)
801  {
802 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
803         /*
804          * Synchronise with the boot thread.
805          */
806 -       spin_lock(&boot_lock);
807 -       spin_unlock(&boot_lock);
808 +       raw_spin_lock(&boot_lock);
809 +       raw_spin_unlock(&boot_lock);
810  }
811  
812  static const struct of_device_id clk_ids[]  = {
813 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
814         /* make sure write buffer is drained */
815         mb();
816  
817 -       spin_lock(&boot_lock);
818 +       raw_spin_lock(&boot_lock);
819  
820         /*
821          * The secondary processor is waiting to be released from
822 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
823          * now the secondary core is starting up let it run its
824          * calibrations, then wait for it to finish
825          */
826 -       spin_unlock(&boot_lock);
827 +       raw_spin_unlock(&boot_lock);
828  
829         return pen_release != -1 ? -ENOSYS : 0;
830  }
831 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
832 index 5494c9e0c909..e8ce157d3548 100644
833 --- a/arch/arm/mach-qcom/platsmp.c
834 +++ b/arch/arm/mach-qcom/platsmp.c
835 @@ -46,7 +46,7 @@
836  
837  extern void secondary_startup_arm(void);
838  
839 -static DEFINE_SPINLOCK(boot_lock);
840 +static DEFINE_RAW_SPINLOCK(boot_lock);
841  
842  #ifdef CONFIG_HOTPLUG_CPU
843  static void qcom_cpu_die(unsigned int cpu)
844 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
845         /*
846          * Synchronise with the boot thread.
847          */
848 -       spin_lock(&boot_lock);
849 -       spin_unlock(&boot_lock);
850 +       raw_spin_lock(&boot_lock);
851 +       raw_spin_unlock(&boot_lock);
852  }
853  
854  static int scss_release_secondary(unsigned int cpu)
855 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
856          * set synchronisation state between this boot processor
857          * and the secondary one
858          */
859 -       spin_lock(&boot_lock);
860 +       raw_spin_lock(&boot_lock);
861  
862         /*
863          * Send the secondary CPU a soft interrupt, thereby causing
864 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
865          * now the secondary core is starting up let it run its
866          * calibrations, then wait for it to finish
867          */
868 -       spin_unlock(&boot_lock);
869 +       raw_spin_unlock(&boot_lock);
870  
871         return ret;
872  }
873 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
874 index 8d1e2d551786..7fa56cc78118 100644
875 --- a/arch/arm/mach-spear/platsmp.c
876 +++ b/arch/arm/mach-spear/platsmp.c
877 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
878         sync_cache_w(&pen_release);
879  }
880  
881 -static DEFINE_SPINLOCK(boot_lock);
882 +static DEFINE_RAW_SPINLOCK(boot_lock);
883  
884  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
885  
886 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
887         /*
888          * Synchronise with the boot thread.
889          */
890 -       spin_lock(&boot_lock);
891 -       spin_unlock(&boot_lock);
892 +       raw_spin_lock(&boot_lock);
893 +       raw_spin_unlock(&boot_lock);
894  }
895  
896  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
897 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
898          * set synchronisation state between this boot processor
899          * and the secondary one
900          */
901 -       spin_lock(&boot_lock);
902 +       raw_spin_lock(&boot_lock);
903  
904         /*
905          * The secondary processor is waiting to be released from
906 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
907          * now the secondary core is starting up let it run its
908          * calibrations, then wait for it to finish
909          */
910 -       spin_unlock(&boot_lock);
911 +       raw_spin_unlock(&boot_lock);
912  
913         return pen_release != -1 ? -ENOSYS : 0;
914  }
915 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
916 index ea5a2277ee46..b988e081ac79 100644
917 --- a/arch/arm/mach-sti/platsmp.c
918 +++ b/arch/arm/mach-sti/platsmp.c
919 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
920         sync_cache_w(&pen_release);
921  }
922  
923 -static DEFINE_SPINLOCK(boot_lock);
924 +static DEFINE_RAW_SPINLOCK(boot_lock);
925  
926  static void sti_secondary_init(unsigned int cpu)
927  {
928 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
929         /*
930          * Synchronise with the boot thread.
931          */
932 -       spin_lock(&boot_lock);
933 -       spin_unlock(&boot_lock);
934 +       raw_spin_lock(&boot_lock);
935 +       raw_spin_unlock(&boot_lock);
936  }
937  
938  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
939 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
940          * set synchronisation state between this boot processor
941          * and the secondary one
942          */
943 -       spin_lock(&boot_lock);
944 +       raw_spin_lock(&boot_lock);
945  
946         /*
947          * The secondary processor is waiting to be released from
948 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
949          * now the secondary core is starting up let it run its
950          * calibrations, then wait for it to finish
951          */
952 -       spin_unlock(&boot_lock);
953 +       raw_spin_unlock(&boot_lock);
954  
955         return pen_release != -1 ? -ENOSYS : 0;
956  }
957 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
958 index 3a2e678b8d30..3ed1e9ba6a01 100644
959 --- a/arch/arm/mm/fault.c
960 +++ b/arch/arm/mm/fault.c
961 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
962         if (addr < TASK_SIZE)
963                 return do_page_fault(addr, fsr, regs);
964  
965 +       if (interrupts_enabled(regs))
966 +               local_irq_enable();
967 +
968         if (user_mode(regs))
969                 goto bad_area;
970  
971 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
972  static int
973  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
974  {
975 +       if (interrupts_enabled(regs))
976 +               local_irq_enable();
977 +
978         do_bad_area(addr, fsr, regs);
979         return 0;
980  }
981 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
982 index d02f8187b1cc..542692dbd40a 100644
983 --- a/arch/arm/mm/highmem.c
984 +++ b/arch/arm/mm/highmem.c
985 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
986         return *ptep;
987  }
988  
989 +static unsigned int fixmap_idx(int type)
990 +{
991 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
992 +}
993 +
994  void *kmap(struct page *page)
995  {
996         might_sleep();
997 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
998  
999  void *kmap_atomic(struct page *page)
1000  {
1001 +       pte_t pte = mk_pte(page, kmap_prot);
1002         unsigned int idx;
1003         unsigned long vaddr;
1004         void *kmap;
1005         int type;
1006  
1007 -       preempt_disable();
1008 +       preempt_disable_nort();
1009         pagefault_disable();
1010         if (!PageHighMem(page))
1011                 return page_address(page);
1012 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1013  
1014         type = kmap_atomic_idx_push();
1015  
1016 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1017 +       idx = fixmap_idx(type);
1018         vaddr = __fix_to_virt(idx);
1019  #ifdef CONFIG_DEBUG_HIGHMEM
1020         /*
1021 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1022          * in place, so the contained TLB flush ensures the TLB is updated
1023          * with the new mapping.
1024          */
1025 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1026 +#ifdef CONFIG_PREEMPT_RT_FULL
1027 +       current->kmap_pte[type] = pte;
1028 +#endif
1029 +       set_fixmap_pte(idx, pte);
1030  
1031         return (void *)vaddr;
1032  }
1033 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1034  
1035         if (kvaddr >= (void *)FIXADDR_START) {
1036                 type = kmap_atomic_idx();
1037 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1038 +               idx = fixmap_idx(type);
1039  
1040                 if (cache_is_vivt())
1041                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1042 +#ifdef CONFIG_PREEMPT_RT_FULL
1043 +               current->kmap_pte[type] = __pte(0);
1044 +#endif
1045  #ifdef CONFIG_DEBUG_HIGHMEM
1046                 BUG_ON(vaddr != __fix_to_virt(idx));
1047 -               set_fixmap_pte(idx, __pte(0));
1048  #else
1049                 (void) idx;  /* to kill a warning */
1050  #endif
1051 +               set_fixmap_pte(idx, __pte(0));
1052                 kmap_atomic_idx_pop();
1053         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1054                 /* this address was obtained through kmap_high_get() */
1055                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1056         }
1057         pagefault_enable();
1058 -       preempt_enable();
1059 +       preempt_enable_nort();
1060  }
1061  EXPORT_SYMBOL(__kunmap_atomic);
1062  
1063  void *kmap_atomic_pfn(unsigned long pfn)
1064  {
1065 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1066         unsigned long vaddr;
1067         int idx, type;
1068         struct page *page = pfn_to_page(pfn);
1069  
1070 -       preempt_disable();
1071 +       preempt_disable_nort();
1072         pagefault_disable();
1073         if (!PageHighMem(page))
1074                 return page_address(page);
1075  
1076         type = kmap_atomic_idx_push();
1077 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1078 +       idx = fixmap_idx(type);
1079         vaddr = __fix_to_virt(idx);
1080  #ifdef CONFIG_DEBUG_HIGHMEM
1081         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1082  #endif
1083 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1084 +#ifdef CONFIG_PREEMPT_RT_FULL
1085 +       current->kmap_pte[type] = pte;
1086 +#endif
1087 +       set_fixmap_pte(idx, pte);
1088  
1089         return (void *)vaddr;
1090  }
1091 +#if defined CONFIG_PREEMPT_RT_FULL
1092 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1093 +{
1094 +       int i;
1095 +
1096 +       /*
1097 +        * Clear @prev's kmap_atomic mappings
1098 +        */
1099 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1100 +               int idx = fixmap_idx(i);
1101 +
1102 +               set_fixmap_pte(idx, __pte(0));
1103 +       }
1104 +       /*
1105 +        * Restore @next_p's kmap_atomic mappings
1106 +        */
1107 +       for (i = 0; i < next_p->kmap_idx; i++) {
1108 +               int idx = fixmap_idx(i);
1109 +
1110 +               if (!pte_none(next_p->kmap_pte[i]))
1111 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1112 +       }
1113 +}
1114 +#endif
1115 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1116 index c2366510187a..6b60f582b738 100644
1117 --- a/arch/arm/plat-versatile/platsmp.c
1118 +++ b/arch/arm/plat-versatile/platsmp.c
1119 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1120         sync_cache_w(&pen_release);
1121  }
1122  
1123 -static DEFINE_SPINLOCK(boot_lock);
1124 +static DEFINE_RAW_SPINLOCK(boot_lock);
1125  
1126  void versatile_secondary_init(unsigned int cpu)
1127  {
1128 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1129         /*
1130          * Synchronise with the boot thread.
1131          */
1132 -       spin_lock(&boot_lock);
1133 -       spin_unlock(&boot_lock);
1134 +       raw_spin_lock(&boot_lock);
1135 +       raw_spin_unlock(&boot_lock);
1136  }
1137  
1138  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1139 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1140          * Set synchronisation state between this boot processor
1141          * and the secondary one
1142          */
1143 -       spin_lock(&boot_lock);
1144 +       raw_spin_lock(&boot_lock);
1145  
1146         /*
1147          * This is really belt and braces; we hold unintended secondary
1148 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1149          * now the secondary core is starting up let it run its
1150          * calibrations, then wait for it to finish
1151          */
1152 -       spin_unlock(&boot_lock);
1153 +       raw_spin_unlock(&boot_lock);
1154  
1155         return pen_release != -1 ? -ENOSYS : 0;
1156  }
1157 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1158 index 969ef880d234..1182fe883771 100644
1159 --- a/arch/arm64/Kconfig
1160 +++ b/arch/arm64/Kconfig
1161 @@ -91,6 +91,7 @@ config ARM64
1162         select HAVE_PERF_EVENTS
1163         select HAVE_PERF_REGS
1164         select HAVE_PERF_USER_STACK_DUMP
1165 +       select HAVE_PREEMPT_LAZY
1166         select HAVE_REGS_AND_STACK_ACCESS_API
1167         select HAVE_RCU_TABLE_FREE
1168         select HAVE_SYSCALL_TRACEPOINTS
1169 @@ -694,7 +695,7 @@ config XEN_DOM0
1170  
1171  config XEN
1172         bool "Xen guest support on ARM64"
1173 -       depends on ARM64 && OF
1174 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1175         select SWIOTLB_XEN
1176         select PARAVIRT
1177         help
1178 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1179 index e9ea5a6bd449..6c500ad63c6a 100644
1180 --- a/arch/arm64/include/asm/thread_info.h
1181 +++ b/arch/arm64/include/asm/thread_info.h
1182 @@ -49,6 +49,7 @@ struct thread_info {
1183         mm_segment_t            addr_limit;     /* address limit */
1184         struct task_struct      *task;          /* main task structure */
1185         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1186 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1187         int                     cpu;            /* cpu */
1188  };
1189  
1190 @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
1191  #define TIF_NEED_RESCHED       1
1192  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1193  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1194 +#define TIF_NEED_RESCHED_LAZY  4
1195  #define TIF_NOHZ               7
1196  #define TIF_SYSCALL_TRACE      8
1197  #define TIF_SYSCALL_AUDIT      9
1198 @@ -127,6 +129,7 @@ static inline struct thread_info *current_thread_info(void)
1199  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1200  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1201  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1202 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1203  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1204  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1205  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1206 @@ -135,7 +138,9 @@ static inline struct thread_info *current_thread_info(void)
1207  #define _TIF_32BIT             (1 << TIF_32BIT)
1208  
1209  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1210 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1211 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1212 +                                _TIF_NEED_RESCHED_LAZY)
1213 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1214  
1215  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1216                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1217 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1218 index 4a2f0f0fef32..6bf2bc17c400 100644
1219 --- a/arch/arm64/kernel/asm-offsets.c
1220 +++ b/arch/arm64/kernel/asm-offsets.c
1221 @@ -38,6 +38,7 @@ int main(void)
1222    BLANK();
1223    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1224    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1225 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1226    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1227    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1228    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1229 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1230 index 79b0fe24d5b7..f3c959ade308 100644
1231 --- a/arch/arm64/kernel/entry.S
1232 +++ b/arch/arm64/kernel/entry.S
1233 @@ -428,11 +428,16 @@ ENDPROC(el1_sync)
1234  
1235  #ifdef CONFIG_PREEMPT
1236         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1237 -       cbnz    w24, 1f                         // preempt count != 0
1238 +       cbnz    w24, 2f                         // preempt count != 0
1239         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1240 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1241 -       bl      el1_preempt
1242 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1243 +
1244 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1245 +       cbnz    w24, 2f                         // preempt lazy count != 0
1246 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1247  1:
1248 +       bl      el1_preempt
1249 +2:
1250  #endif
1251  #ifdef CONFIG_TRACE_IRQFLAGS
1252         bl      trace_hardirqs_on
1253 @@ -446,6 +451,7 @@ ENDPROC(el1_irq)
1254  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1255         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1256         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1257 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1258         ret     x24
1259  #endif
1260  
1261 diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
1262 index 404dd67080b9..639dc6d12e72 100644
1263 --- a/arch/arm64/kernel/signal.c
1264 +++ b/arch/arm64/kernel/signal.c
1265 @@ -409,7 +409,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
1266          */
1267         trace_hardirqs_off();
1268         do {
1269 -               if (thread_flags & _TIF_NEED_RESCHED) {
1270 +               if (thread_flags & _TIF_NEED_RESCHED_MASK) {
1271                         schedule();
1272                 } else {
1273                         local_irq_enable();
1274 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1275 index b3c5bde43d34..8122bf058de0 100644
1276 --- a/arch/mips/Kconfig
1277 +++ b/arch/mips/Kconfig
1278 @@ -2514,7 +2514,7 @@ config MIPS_ASID_BITS_VARIABLE
1279  #
1280  config HIGHMEM
1281         bool "High Memory Support"
1282 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1283 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1284  
1285  config CPU_SUPPORTS_HIGHMEM
1286         bool
1287 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1288 index 65fba4c34cd7..4b5ba68910e0 100644
1289 --- a/arch/powerpc/Kconfig
1290 +++ b/arch/powerpc/Kconfig
1291 @@ -52,10 +52,11 @@ config LOCKDEP_SUPPORT
1292  
1293  config RWSEM_GENERIC_SPINLOCK
1294         bool
1295 +       default y if PREEMPT_RT_FULL
1296  
1297  config RWSEM_XCHGADD_ALGORITHM
1298         bool
1299 -       default y
1300 +       default y if !PREEMPT_RT_FULL
1301  
1302  config GENERIC_LOCKBREAK
1303         bool
1304 @@ -134,6 +135,7 @@ config PPC
1305         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1306         select GENERIC_STRNCPY_FROM_USER
1307         select GENERIC_STRNLEN_USER
1308 +       select HAVE_PREEMPT_LAZY
1309         select HAVE_MOD_ARCH_SPECIFIC
1310         select MODULES_USE_ELF_RELA
1311         select CLONE_BACKWARDS
1312 @@ -321,7 +323,7 @@ menu "Kernel options"
1313  
1314  config HIGHMEM
1315         bool "High memory support"
1316 -       depends on PPC32
1317 +       depends on PPC32 && !PREEMPT_RT_FULL
1318  
1319  source kernel/Kconfig.hz
1320  source kernel/Kconfig.preempt
1321 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1322 index 87e4b2d8dcd4..981e501a4359 100644
1323 --- a/arch/powerpc/include/asm/thread_info.h
1324 +++ b/arch/powerpc/include/asm/thread_info.h
1325 @@ -43,6 +43,8 @@ struct thread_info {
1326         int             cpu;                    /* cpu we're on */
1327         int             preempt_count;          /* 0 => preemptable,
1328                                                    <0 => BUG */
1329 +       int             preempt_lazy_count;     /* 0 => preemptable,
1330 +                                                  <0 => BUG */
1331         unsigned long   local_flags;            /* private flags for thread */
1332  #ifdef CONFIG_LIVEPATCH
1333         unsigned long *livepatch_sp;
1334 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1335  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1336  #define TIF_SIGPENDING         1       /* signal pending */
1337  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1338 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1339 -                                          TIF_NEED_RESCHED */
1340 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1341  #define TIF_32BIT              4       /* 32 bit binary */
1342  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1343  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1344 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1345  #if defined(CONFIG_PPC64)
1346  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1347  #endif
1348 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1349 +                                          TIF_NEED_RESCHED */
1350  
1351  /* as above, but as bit values */
1352  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1353 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1354  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1355  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1356  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1357 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1358  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1359                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1360                                  _TIF_NOHZ)
1361  
1362  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1363                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1364 -                                _TIF_RESTORE_TM)
1365 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1366  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1367 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1368  
1369  /* Bits in local_flags */
1370  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
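The thread_info hunks above split rescheduling into two levels: TIF_NEED_RESCHED still means "preempt as soon as possible", while the new TIF_NEED_RESCHED_LAZY only asks for a reschedule at the next convenient point and is gated by the new preempt_lazy_count. The entry/exit paths then test the combined _TIF_NEED_RESCHED_MASK. A minimal user-space model of that decision, reusing the bit numbers from the powerpc hunk, is sketched below; nothing in it is kernel code.

/* Illustrative user-space model of the two-level reschedule flags added
 * above; the names mirror the patch but this is not kernel code. */
#include <stdio.h>

#define TIF_NEED_RESCHED        2
#define TIF_NEED_RESCHED_LAZY   3

#define _TIF_NEED_RESCHED       (1u << TIF_NEED_RESCHED)
#define _TIF_NEED_RESCHED_LAZY  (1u << TIF_NEED_RESCHED_LAZY)
#define _TIF_NEED_RESCHED_MASK  (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)

/* Decision the interrupt-return paths make: a "hard" NEED_RESCHED always
 * wins; the lazy flag only matters while preempt_lazy_count is zero. */
static int should_preempt(unsigned int ti_flags,
                          int preempt_count, int preempt_lazy_count)
{
        if (preempt_count)
                return 0;
        if (ti_flags & _TIF_NEED_RESCHED)
                return 1;
        if (preempt_lazy_count)
                return 0;
        return !!(ti_flags & _TIF_NEED_RESCHED_LAZY);
}

int main(void)
{
        printf("%d\n", should_preempt(_TIF_NEED_RESCHED_LAZY, 0, 0)); /* 1 */
        printf("%d\n", should_preempt(_TIF_NEED_RESCHED_LAZY, 0, 1)); /* 0 */
        printf("%d\n", should_preempt(_TIF_NEED_RESCHED, 0, 1));      /* 1 */
        return 0;
}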
1371 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1372 index c833d88c423d..96e9fbc3f684 100644
1373 --- a/arch/powerpc/kernel/asm-offsets.c
1374 +++ b/arch/powerpc/kernel/asm-offsets.c
1375 @@ -156,6 +156,7 @@ int main(void)
1376         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1377         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1378         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1379 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1380         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1381         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1382  
1383 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1384 index 3841d749a430..6dbaeff192b9 100644
1385 --- a/arch/powerpc/kernel/entry_32.S
1386 +++ b/arch/powerpc/kernel/entry_32.S
1387 @@ -835,7 +835,14 @@ user_exc_return:           /* r10 contains MSR_KERNEL here */
1388         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1389         bne     restore
1390         andi.   r8,r8,_TIF_NEED_RESCHED
1391 +       bne+    1f
1392 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1393 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1394 +       bne     restore
1395 +       lwz     r0,TI_FLAGS(r9)
1396 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1397         beq+    restore
1398 +1:
1399         lwz     r3,_MSR(r1)
1400         andi.   r0,r3,MSR_EE    /* interrupts off? */
1401         beq     restore         /* don't schedule if so */
1402 @@ -846,11 +853,11 @@ user_exc_return:          /* r10 contains MSR_KERNEL here */
1403          */
1404         bl      trace_hardirqs_off
1405  #endif
1406 -1:     bl      preempt_schedule_irq
1407 +2:     bl      preempt_schedule_irq
1408         CURRENT_THREAD_INFO(r9, r1)
1409         lwz     r3,TI_FLAGS(r9)
1410 -       andi.   r0,r3,_TIF_NEED_RESCHED
1411 -       bne-    1b
1412 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1413 +       bne-    2b
1414  #ifdef CONFIG_TRACE_IRQFLAGS
1415         /* And now, to properly rebalance the above, we tell lockdep they
1416          * are being turned back on, which will happen when we return
1417 @@ -1171,7 +1178,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
1418  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1419  
1420  do_work:                       /* r10 contains MSR_KERNEL here */
1421 -       andi.   r0,r9,_TIF_NEED_RESCHED
1422 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1423         beq     do_user_signal
1424  
1425  do_resched:                    /* r10 contains MSR_KERNEL here */
1426 @@ -1192,7 +1199,7 @@ do_resched:                       /* r10 contains MSR_KERNEL here */
1427         MTMSRD(r10)             /* disable interrupts */
1428         CURRENT_THREAD_INFO(r9, r1)
1429         lwz     r9,TI_FLAGS(r9)
1430 -       andi.   r0,r9,_TIF_NEED_RESCHED
1431 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1432         bne-    do_resched
1433         andi.   r0,r9,_TIF_USER_WORK_MASK
1434         beq     restore_user
1435 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1436 index 6432d4bf08c8..5509a26f1070 100644
1437 --- a/arch/powerpc/kernel/entry_64.S
1438 +++ b/arch/powerpc/kernel/entry_64.S
1439 @@ -656,7 +656,7 @@ _GLOBAL(ret_from_except_lite)
1440         bl      restore_math
1441         b       restore
1442  #endif
1443 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1444 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1445         beq     2f
1446         bl      restore_interrupts
1447         SCHEDULE_USER
1448 @@ -718,10 +718,18 @@ _GLOBAL(ret_from_except_lite)
1449  
1450  #ifdef CONFIG_PREEMPT
1451         /* Check if we need to preempt */
1452 -       andi.   r0,r4,_TIF_NEED_RESCHED
1453 -       beq+    restore
1454 -       /* Check that preempt_count() == 0 and interrupts are enabled */
1455         lwz     r8,TI_PREEMPT(r9)
1456 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1457 +       bne     restore
1458 +       andi.   r0,r4,_TIF_NEED_RESCHED
1459 +       bne+    check_count
1460 +
1461 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1462 +       beq+    restore
1463 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1464 +
1465 +       /* Check that preempt_count() == 0 and interrupts are enabled */
1466 +check_count:
1467         cmpwi   cr1,r8,0
1468         ld      r0,SOFTE(r1)
1469         cmpdi   r0,0
1470 @@ -738,7 +746,7 @@ _GLOBAL(ret_from_except_lite)
1471         /* Re-test flags and eventually loop */
1472         CURRENT_THREAD_INFO(r9, r1)
1473         ld      r4,TI_FLAGS(r9)
1474 -       andi.   r0,r4,_TIF_NEED_RESCHED
1475 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1476         bne     1b
1477  
1478         /*
1479 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1480 index 3c05c311e35e..f83f6ac1274d 100644
1481 --- a/arch/powerpc/kernel/irq.c
1482 +++ b/arch/powerpc/kernel/irq.c
1483 @@ -638,6 +638,7 @@ void irq_ctx_init(void)
1484         }
1485  }
1486  
1487 +#ifndef CONFIG_PREEMPT_RT_FULL
1488  void do_softirq_own_stack(void)
1489  {
1490         struct thread_info *curtp, *irqtp;
1491 @@ -655,6 +656,7 @@ void do_softirq_own_stack(void)
1492         if (irqtp->flags)
1493                 set_bits(irqtp->flags, &curtp->flags);
1494  }
1495 +#endif
1496  
1497  irq_hw_number_t virq_to_hw(unsigned int virq)
1498  {
1499 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1500 index 030d72df5dd5..b471a709e100 100644
1501 --- a/arch/powerpc/kernel/misc_32.S
1502 +++ b/arch/powerpc/kernel/misc_32.S
1503 @@ -41,6 +41,7 @@
1504   * We store the saved ksp_limit in the unused part
1505   * of the STACK_FRAME_OVERHEAD
1506   */
1507 +#ifndef CONFIG_PREEMPT_RT_FULL
1508  _GLOBAL(call_do_softirq)
1509         mflr    r0
1510         stw     r0,4(r1)
1511 @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq)
1512         stw     r10,THREAD+KSP_LIMIT(r2)
1513         mtlr    r0
1514         blr
1515 +#endif
1516  
1517  /*
1518   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1519 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1520 index 4f178671f230..39e7d84a3492 100644
1521 --- a/arch/powerpc/kernel/misc_64.S
1522 +++ b/arch/powerpc/kernel/misc_64.S
1523 @@ -31,6 +31,7 @@
1524  
1525         .text
1526  
1527 +#ifndef CONFIG_PREEMPT_RT_FULL
1528  _GLOBAL(call_do_softirq)
1529         mflr    r0
1530         std     r0,16(r1)
1531 @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq)
1532         ld      r0,16(r1)
1533         mtlr    r0
1534         blr
1535 +#endif
1536  
1537  _GLOBAL(call_do_irq)
1538         mflr    r0
1539 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1540 index 029be26b5a17..9528089ea142 100644
1541 --- a/arch/powerpc/kvm/Kconfig
1542 +++ b/arch/powerpc/kvm/Kconfig
1543 @@ -175,6 +175,7 @@ config KVM_E500MC
1544  config KVM_MPIC
1545         bool "KVM in-kernel MPIC emulation"
1546         depends on KVM && E500
1547 +       depends on !PREEMPT_RT_FULL
1548         select HAVE_KVM_IRQCHIP
1549         select HAVE_KVM_IRQFD
1550         select HAVE_KVM_IRQ_ROUTING
1551 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1552 index e48462447ff0..2670cee66064 100644
1553 --- a/arch/powerpc/platforms/ps3/device-init.c
1554 +++ b/arch/powerpc/platforms/ps3/device-init.c
1555 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1556         }
1557         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1558  
1559 -       res = wait_event_interruptible(dev->done.wait,
1560 +       res = swait_event_interruptible(dev->done.wait,
1561                                        dev->done.done || kthread_should_stop());
1562         if (kthread_should_stop())
1563                 res = -EINTR;
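The ps3 notification thread waits directly on the completion's internal wait queue, and on RT struct completion is backed by a simple waitqueue (swait), so the wait has to go through the swait API. A hedged sketch of that pattern with made-up device names follows; swait_event_interruptible() and swake_up() are the existing swait primitives, everything else is illustrative.

/* Sketch only: waiting on a simple waitqueue the way the ps3 hunk above
 * does once the completion is swait-backed.  demo_dev is invented. */
#include <linux/swait.h>
#include <linux/kthread.h>

struct demo_dev {
        struct swait_queue_head wait;
        bool done;
};

static void demo_init(struct demo_dev *dev)
{
        init_swait_queue_head(&dev->wait);
        dev->done = false;
}

static int demo_wait(struct demo_dev *dev)
{
        int res;

        /* Sleeps until the condition is true or a signal arrives. */
        res = swait_event_interruptible(dev->wait,
                                        dev->done || kthread_should_stop());
        if (kthread_should_stop())
                res = -EINTR;
        return res;
}

static void demo_complete(struct demo_dev *dev)
{
        dev->done = true;
        swake_up(&dev->wait);           /* wake one waiter */
}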
1564 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1565 index 6c0378c0b8b5..abd58b4dff97 100644
1566 --- a/arch/sh/kernel/irq.c
1567 +++ b/arch/sh/kernel/irq.c
1568 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1569         hardirq_ctx[cpu] = NULL;
1570  }
1571  
1572 +#ifndef CONFIG_PREEMPT_RT_FULL
1573  void do_softirq_own_stack(void)
1574  {
1575         struct thread_info *curctx;
1576 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1577                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1578         );
1579  }
1580 +#endif
1581  #else
1582  static inline void handle_one_irq(unsigned int irq)
1583  {
1584 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1585 index 165ecdd24d22..b68a464a22be 100644
1586 --- a/arch/sparc/Kconfig
1587 +++ b/arch/sparc/Kconfig
1588 @@ -194,12 +194,10 @@ config NR_CPUS
1589  source kernel/Kconfig.hz
1590  
1591  config RWSEM_GENERIC_SPINLOCK
1592 -       bool
1593 -       default y if SPARC32
1594 +       def_bool PREEMPT_RT_FULL
1595  
1596  config RWSEM_XCHGADD_ALGORITHM
1597 -       bool
1598 -       default y if SPARC64
1599 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1600  
1601  config GENERIC_HWEIGHT
1602         bool
1603 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1604 index 34a7930b76ef..773740521008 100644
1605 --- a/arch/sparc/kernel/irq_64.c
1606 +++ b/arch/sparc/kernel/irq_64.c
1607 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1608         set_irq_regs(old_regs);
1609  }
1610  
1611 +#ifndef CONFIG_PREEMPT_RT_FULL
1612  void do_softirq_own_stack(void)
1613  {
1614         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1615 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1616         __asm__ __volatile__("mov %0, %%sp"
1617                              : : "r" (orig_sp));
1618  }
1619 +#endif
1620  
1621  #ifdef CONFIG_HOTPLUG_CPU
1622  void fixup_irqs(void)
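powerpc, sh and sparc (and x86 further down) all compile out do_softirq_own_stack() under PREEMPT_RT_FULL: with RT, softirqs are processed in task context, so there is never a reason to switch to a dedicated per-CPU softirq stack. The sketch below models that choice; defer_to_softirq_thread() is a stand-in, not a real kernel symbol.

/* Simplified model of why the arch hook can go away on RT.  The helper
 * names are stand-ins; only do_softirq_own_stack() is the real hook that
 * the hunks above remove for RT builds. */
void do_softirq_own_stack(void);        /* arch hook, compiled out on RT above */
void defer_to_softirq_thread(void);     /* stand-in for waking the softirq thread */

static void invoke_softirq_model(void)
{
#ifndef CONFIG_PREEMPT_RT_FULL
        do_softirq_own_stack();         /* switch stacks and run softirqs now */
#else
        defer_to_softirq_thread();      /* RT: run in task context, preemptibly */
#endif
}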
1623 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1624 index bada636d1065..f8a995c90c01 100644
1625 --- a/arch/x86/Kconfig
1626 +++ b/arch/x86/Kconfig
1627 @@ -17,6 +17,7 @@ config X86_64
1628  ### Arch settings
1629  config X86
1630         def_bool y
1631 +       select HAVE_PREEMPT_LAZY
1632         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1633         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1634         select ANON_INODES
1635 @@ -232,8 +233,11 @@ config ARCH_MAY_HAVE_PC_FDC
1636         def_bool y
1637         depends on ISA_DMA_API
1638  
1639 +config RWSEM_GENERIC_SPINLOCK
1640 +       def_bool PREEMPT_RT_FULL
1641 +
1642  config RWSEM_XCHGADD_ALGORITHM
1643 -       def_bool y
1644 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1645  
1646  config GENERIC_CALIBRATE_DELAY
1647         def_bool y
1648 @@ -897,7 +901,7 @@ config IOMMU_HELPER
1649  config MAXSMP
1650         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1651         depends on X86_64 && SMP && DEBUG_KERNEL
1652 -       select CPUMASK_OFFSTACK
1653 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1654         ---help---
1655           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1656           If unsure, say N.
1657 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1658 index aa8b0672f87a..2429414bfc71 100644
1659 --- a/arch/x86/crypto/aesni-intel_glue.c
1660 +++ b/arch/x86/crypto/aesni-intel_glue.c
1661 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1662         err = blkcipher_walk_virt(desc, &walk);
1663         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1664  
1665 -       kernel_fpu_begin();
1666         while ((nbytes = walk.nbytes)) {
1667 +               kernel_fpu_begin();
1668                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1669 -                             nbytes & AES_BLOCK_MASK);
1670 +                               nbytes & AES_BLOCK_MASK);
1671 +               kernel_fpu_end();
1672                 nbytes &= AES_BLOCK_SIZE - 1;
1673                 err = blkcipher_walk_done(desc, &walk, nbytes);
1674         }
1675 -       kernel_fpu_end();
1676  
1677         return err;
1678  }
1679 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1680         err = blkcipher_walk_virt(desc, &walk);
1681         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1682  
1683 -       kernel_fpu_begin();
1684         while ((nbytes = walk.nbytes)) {
1685 +               kernel_fpu_begin();
1686                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1687                               nbytes & AES_BLOCK_MASK);
1688 +               kernel_fpu_end();
1689                 nbytes &= AES_BLOCK_SIZE - 1;
1690                 err = blkcipher_walk_done(desc, &walk, nbytes);
1691         }
1692 -       kernel_fpu_end();
1693  
1694         return err;
1695  }
1696 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1697         err = blkcipher_walk_virt(desc, &walk);
1698         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1699  
1700 -       kernel_fpu_begin();
1701         while ((nbytes = walk.nbytes)) {
1702 +               kernel_fpu_begin();
1703                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1704                               nbytes & AES_BLOCK_MASK, walk.iv);
1705 +               kernel_fpu_end();
1706                 nbytes &= AES_BLOCK_SIZE - 1;
1707                 err = blkcipher_walk_done(desc, &walk, nbytes);
1708         }
1709 -       kernel_fpu_end();
1710  
1711         return err;
1712  }
1713 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1714         err = blkcipher_walk_virt(desc, &walk);
1715         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1716  
1717 -       kernel_fpu_begin();
1718         while ((nbytes = walk.nbytes)) {
1719 +               kernel_fpu_begin();
1720                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1721                               nbytes & AES_BLOCK_MASK, walk.iv);
1722 +               kernel_fpu_end();
1723                 nbytes &= AES_BLOCK_SIZE - 1;
1724                 err = blkcipher_walk_done(desc, &walk, nbytes);
1725         }
1726 -       kernel_fpu_end();
1727  
1728         return err;
1729  }
1730 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1731         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1732         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1733  
1734 -       kernel_fpu_begin();
1735         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1736 +               kernel_fpu_begin();
1737                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1738                                       nbytes & AES_BLOCK_MASK, walk.iv);
1739 +               kernel_fpu_end();
1740                 nbytes &= AES_BLOCK_SIZE - 1;
1741                 err = blkcipher_walk_done(desc, &walk, nbytes);
1742         }
1743         if (walk.nbytes) {
1744 +               kernel_fpu_begin();
1745                 ctr_crypt_final(ctx, &walk);
1746 +               kernel_fpu_end();
1747                 err = blkcipher_walk_done(desc, &walk, 0);
1748         }
1749 -       kernel_fpu_end();
1750  
1751         return err;
1752  }
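kernel_fpu_begin() disables preemption until the matching kernel_fpu_end(), so wrapping the whole blkcipher walk in one FPU section creates an arbitrarily long preempt-off region. The hunks above take the FPU section per processed chunk instead, which bounds the non-preemptible time to one block batch; the cast5 and glue_helper hunks that follow apply the same idea by passing false into every *_fpu_begin() call and ending the section before blkcipher_walk_done(). A minimal sketch of the chunked pattern, with stand-in helpers rather than the real FPU API, is:

/* Sketch of the per-chunk pattern the aesni hunks switch to.  begin_fpu(),
 * end_fpu() and process_chunk() are stand-ins, not kernel functions. */
#include <stddef.h>

void begin_fpu(void);
void end_fpu(void);
size_t process_chunk(const unsigned char *src, unsigned char *dst, size_t len);

static void crypt_walk(const unsigned char *src, unsigned char *dst,
                       size_t total, size_t chunk)
{
        size_t done = 0;

        while (done < total) {
                size_t n = total - done;

                if (n > chunk)
                        n = chunk;

                begin_fpu();            /* preemption disabled only here ... */
                process_chunk(src + done, dst + done, n);
                end_fpu();              /* ... and re-enabled after each chunk */

                done += n;
        }
}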
1753 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1754 index 8648158f3916..d7699130ee36 100644
1755 --- a/arch/x86/crypto/cast5_avx_glue.c
1756 +++ b/arch/x86/crypto/cast5_avx_glue.c
1757 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1758  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1759                      bool enc)
1760  {
1761 -       bool fpu_enabled = false;
1762 +       bool fpu_enabled;
1763         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1764         const unsigned int bsize = CAST5_BLOCK_SIZE;
1765         unsigned int nbytes;
1766 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1767                 u8 *wsrc = walk->src.virt.addr;
1768                 u8 *wdst = walk->dst.virt.addr;
1769  
1770 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1771 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1772  
1773                 /* Process multi-block batch */
1774                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1775 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1776                 } while (nbytes >= bsize);
1777  
1778  done:
1779 +               cast5_fpu_end(fpu_enabled);
1780                 err = blkcipher_walk_done(desc, walk, nbytes);
1781         }
1782 -
1783 -       cast5_fpu_end(fpu_enabled);
1784         return err;
1785  }
1786  
1787 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1788  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1789                        struct scatterlist *src, unsigned int nbytes)
1790  {
1791 -       bool fpu_enabled = false;
1792 +       bool fpu_enabled;
1793         struct blkcipher_walk walk;
1794         int err;
1795  
1796 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1797         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1798  
1799         while ((nbytes = walk.nbytes)) {
1800 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1801 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1802                 nbytes = __cbc_decrypt(desc, &walk);
1803 +               cast5_fpu_end(fpu_enabled);
1804                 err = blkcipher_walk_done(desc, &walk, nbytes);
1805         }
1806 -
1807 -       cast5_fpu_end(fpu_enabled);
1808         return err;
1809  }
1810  
1811 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1812  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1813                      struct scatterlist *src, unsigned int nbytes)
1814  {
1815 -       bool fpu_enabled = false;
1816 +       bool fpu_enabled;
1817         struct blkcipher_walk walk;
1818         int err;
1819  
1820 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1821         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1822  
1823         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1824 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1825 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1826                 nbytes = __ctr_crypt(desc, &walk);
1827 +               cast5_fpu_end(fpu_enabled);
1828                 err = blkcipher_walk_done(desc, &walk, nbytes);
1829         }
1830  
1831 -       cast5_fpu_end(fpu_enabled);
1832 -
1833         if (walk.nbytes) {
1834                 ctr_crypt_final(desc, &walk);
1835                 err = blkcipher_walk_done(desc, &walk, 0);
1836 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1837 index 6a85598931b5..3a506ce7ed93 100644
1838 --- a/arch/x86/crypto/glue_helper.c
1839 +++ b/arch/x86/crypto/glue_helper.c
1840 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1841         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1842         const unsigned int bsize = 128 / 8;
1843         unsigned int nbytes, i, func_bytes;
1844 -       bool fpu_enabled = false;
1845 +       bool fpu_enabled;
1846         int err;
1847  
1848         err = blkcipher_walk_virt(desc, walk);
1849 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1850                 u8 *wdst = walk->dst.virt.addr;
1851  
1852                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1853 -                                            desc, fpu_enabled, nbytes);
1854 +                                            desc, false, nbytes);
1855  
1856                 for (i = 0; i < gctx->num_funcs; i++) {
1857                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1858 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1859                 }
1860  
1861  done:
1862 +               glue_fpu_end(fpu_enabled);
1863                 err = blkcipher_walk_done(desc, walk, nbytes);
1864         }
1865  
1866 -       glue_fpu_end(fpu_enabled);
1867         return err;
1868  }
1869  
1870 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1871                             struct scatterlist *src, unsigned int nbytes)
1872  {
1873         const unsigned int bsize = 128 / 8;
1874 -       bool fpu_enabled = false;
1875 +       bool fpu_enabled;
1876         struct blkcipher_walk walk;
1877         int err;
1878  
1879 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1880  
1881         while ((nbytes = walk.nbytes)) {
1882                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1883 -                                            desc, fpu_enabled, nbytes);
1884 +                                            desc, false, nbytes);
1885                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1886 +               glue_fpu_end(fpu_enabled);
1887                 err = blkcipher_walk_done(desc, &walk, nbytes);
1888         }
1889  
1890 -       glue_fpu_end(fpu_enabled);
1891         return err;
1892  }
1893  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1894 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1895                           struct scatterlist *src, unsigned int nbytes)
1896  {
1897         const unsigned int bsize = 128 / 8;
1898 -       bool fpu_enabled = false;
1899 +       bool fpu_enabled;
1900         struct blkcipher_walk walk;
1901         int err;
1902  
1903 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1904  
1905         while ((nbytes = walk.nbytes) >= bsize) {
1906                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1907 -                                            desc, fpu_enabled, nbytes);
1908 +                                            desc, false, nbytes);
1909                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1910 +               glue_fpu_end(fpu_enabled);
1911                 err = blkcipher_walk_done(desc, &walk, nbytes);
1912         }
1913  
1914 -       glue_fpu_end(fpu_enabled);
1915 -
1916         if (walk.nbytes) {
1917                 glue_ctr_crypt_final_128bit(
1918                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1919 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1920                           void *tweak_ctx, void *crypt_ctx)
1921  {
1922         const unsigned int bsize = 128 / 8;
1923 -       bool fpu_enabled = false;
1924 +       bool fpu_enabled;
1925         struct blkcipher_walk walk;
1926         int err;
1927  
1928 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1929  
1930         /* set minimum length to bsize, for tweak_fn */
1931         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1932 -                                    desc, fpu_enabled,
1933 +                                    desc, false,
1934                                      nbytes < bsize ? bsize : nbytes);
1935 -
1936         /* calculate first value of T */
1937         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1938 +       glue_fpu_end(fpu_enabled);
1939  
1940         while (nbytes) {
1941 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1942 +                               desc, false, nbytes);
1943                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1944  
1945 +               glue_fpu_end(fpu_enabled);
1946                 err = blkcipher_walk_done(desc, &walk, nbytes);
1947                 nbytes = walk.nbytes;
1948         }
1949 -
1950 -       glue_fpu_end(fpu_enabled);
1951 -
1952         return err;
1953  }
1954  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1955 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1956 index bdd9cc59d20f..56d01a339ba4 100644
1957 --- a/arch/x86/entry/common.c
1958 +++ b/arch/x86/entry/common.c
1959 @@ -129,7 +129,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1960  
1961  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
1962         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
1963 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
1964 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
1965  
1966  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1967  {
1968 @@ -145,9 +145,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
1969                 /* We have work to do. */
1970                 local_irq_enable();
1971  
1972 -               if (cached_flags & _TIF_NEED_RESCHED)
1973 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
1974                         schedule();
1975  
1976 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
1977 +               if (unlikely(current->forced_info.si_signo)) {
1978 +                       struct task_struct *t = current;
1979 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
1980 +                       t->forced_info.si_signo = 0;
1981 +               }
1982 +#endif
1983                 if (cached_flags & _TIF_UPROBE)
1984                         uprobe_notify_resume(regs);
1985  
1986 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
1987 index edba8606b99a..4a3389535fc6 100644
1988 --- a/arch/x86/entry/entry_32.S
1989 +++ b/arch/x86/entry/entry_32.S
1990 @@ -308,8 +308,25 @@ END(ret_from_exception)
1991  ENTRY(resume_kernel)
1992         DISABLE_INTERRUPTS(CLBR_ANY)
1993  need_resched:
1994 +       # preempt count == 0 + NEED_RS set?
1995         cmpl    $0, PER_CPU_VAR(__preempt_count)
1996 +#ifndef CONFIG_PREEMPT_LAZY
1997         jnz     restore_all
1998 +#else
1999 +       jz test_int_off
2000 +
2001 +       # at least preempt count == 0 ?
2002 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2003 +       jne restore_all
2004 +
2005 +       movl    PER_CPU_VAR(current_task), %ebp
2006 +       cmpl $0,TASK_TI_preempt_lazy_count(%ebp)        # non-zero preempt_lazy_count ?
2007 +       jnz restore_all
2008 +
2009 +       testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp)
2010 +       jz restore_all
2011 +test_int_off:
2012 +#endif
2013         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2014         jz      restore_all
2015         call    preempt_schedule_irq
2016 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2017 index ef766a358b37..28401f826ab1 100644
2018 --- a/arch/x86/entry/entry_64.S
2019 +++ b/arch/x86/entry/entry_64.S
2020 @@ -546,7 +546,23 @@ GLOBAL(retint_user)
2021         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2022         jnc     1f
2023  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2024 +#ifndef CONFIG_PREEMPT_LAZY
2025         jnz     1f
2026 +#else
2027 +       jz      do_preempt_schedule_irq
2028 +
2029 +       # at least preempt count == 0 ?
2030 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2031 +       jnz     1f
2032 +
2033 +       movq    PER_CPU_VAR(current_task), %rcx
2034 +       cmpl    $0, TASK_TI_preempt_lazy_count(%rcx)
2035 +       jnz     1f
2036 +
2037 +       bt      $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx)
2038 +       jnc     1f
2039 +do_preempt_schedule_irq:
2040 +#endif
2041         call    preempt_schedule_irq
2042         jmp     0b
2043  1:
2044 @@ -894,6 +910,7 @@ EXPORT_SYMBOL(native_load_gs_index)
2045         jmp     2b
2046         .previous
2047  
2048 +#ifndef CONFIG_PREEMPT_RT_FULL
2049  /* Call softirq on interrupt stack. Interrupts are off. */
2050  ENTRY(do_softirq_own_stack)
2051         pushq   %rbp
2052 @@ -906,6 +923,7 @@ ENTRY(do_softirq_own_stack)
2053         decl    PER_CPU_VAR(irq_count)
2054         ret
2055  END(do_softirq_own_stack)
2056 +#endif
2057  
2058  #ifdef CONFIG_XEN
2059  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
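Both entry paths now implement the same decision in assembly: preempt immediately when __preempt_count reaches zero (count zero plus a hard NEED_RESCHED), otherwise preempt only lazily, and only when the count is exactly PREEMPT_ENABLED, preempt_lazy_count is zero and TIF_NEED_RESCHED_LAZY is set. A C rendering of that check is sketched below; the constant is an assumption mirroring PREEMPT_NEED_RESCHED and the function is illustrative, not kernel code.

/* C rendering of the assembly check added above (illustrative model).
 * 'pc' models the per-CPU __preempt_count, where 0 means "count is zero
 * and a hard NEED_RESCHED is pending" and PREEMPT_ENABLED_BIT means
 * "count is zero, nothing hard pending". */
#define PREEMPT_ENABLED_BIT   0x80000000u   /* assumption: mirrors PREEMPT_NEED_RESCHED */

static int resume_kernel_should_preempt(unsigned int pc,
                                        int preempt_lazy_count,
                                        int need_resched_lazy)
{
        if (pc == 0)
                return 1;                     /* hard preemption, as before */
        if (pc != PREEMPT_ENABLED_BIT)
                return 0;                     /* still inside a preempt-off section */
        if (preempt_lazy_count)
                return 0;                     /* lazy preemption currently forbidden */
        return need_resched_lazy;             /* only the lazy flag is left to honour */
}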
2060 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2061 index 17f218645701..11bd1b7ee6eb 100644
2062 --- a/arch/x86/include/asm/preempt.h
2063 +++ b/arch/x86/include/asm/preempt.h
2064 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2065   * a decrement which hits zero means we have no preempt_count and should
2066   * reschedule.
2067   */
2068 -static __always_inline bool __preempt_count_dec_and_test(void)
2069 +static __always_inline bool ____preempt_count_dec_and_test(void)
2070  {
2071         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2072  }
2073  
2074 +static __always_inline bool __preempt_count_dec_and_test(void)
2075 +{
2076 +       if (____preempt_count_dec_and_test())
2077 +               return true;
2078 +#ifdef CONFIG_PREEMPT_LAZY
2079 +       if (current_thread_info()->preempt_lazy_count)
2080 +               return false;
2081 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2082 +#else
2083 +       return false;
2084 +#endif
2085 +}
2086 +
2087  /*
2088   * Returns true when we need to resched and can (barring IRQ state).
2089   */
2090  static __always_inline bool should_resched(int preempt_offset)
2091  {
2092 +#ifdef CONFIG_PREEMPT_LAZY
2093 +       u32 tmp;
2094 +
2095 +       tmp = raw_cpu_read_4(__preempt_count);
2096 +       if (tmp == preempt_offset)
2097 +               return true;
2098 +
2099 +       /* preempt count == 0 ? */
2100 +       tmp &= ~PREEMPT_NEED_RESCHED;
2101 +       if (tmp)
2102 +               return false;
2103 +       if (current_thread_info()->preempt_lazy_count)
2104 +               return false;
2105 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2106 +#else
2107         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2108 +#endif
2109  }
2110  
2111  #ifdef CONFIG_PREEMPT
2112 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2113 index 8af22be0fe61..d1328789b759 100644
2114 --- a/arch/x86/include/asm/signal.h
2115 +++ b/arch/x86/include/asm/signal.h
2116 @@ -27,6 +27,19 @@ typedef struct {
2117  #define SA_IA32_ABI    0x02000000u
2118  #define SA_X32_ABI     0x01000000u
2119  
2120 +/*
2121 + * Because some traps use the IST stack, we must keep preemption
2122 + * disabled while calling do_trap(), but do_trap() may call
2123 + * force_sig_info() which will grab the signal spin_locks for the
2124 + * task, which in PREEMPT_RT_FULL are mutexes.  By defining
2125 + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set
2126 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the
2127 + * trap.
2128 + */
2129 +#if defined(CONFIG_PREEMPT_RT_FULL)
2130 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2131 +#endif
2132 +
2133  #ifndef CONFIG_COMPAT
2134  typedef sigset_t compat_sigset_t;
2135  #endif
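The comment above is the rationale for the earlier hunk in arch/x86/entry/common.c: a trap running on an IST stack cannot take the (on RT, sleeping) sighand lock, so force_sig_info() parks the pending signal in the task and it is actually delivered on the way back to user space. The sketch below shows the two halves of that scheme; forced_info is the field the RT patch adds to struct task_struct, and the helper names are made up.

/* Sketch of the delayed-signal scheme described above; not kernel code. */
#include <linux/sched.h>
#include <linux/signal.h>

/* Trap side: runs with preemption disabled, so it may not take the
 * sleeping sighand lock.  It only records what to send. */
static void rt_defer_forced_signal(struct task_struct *t, struct siginfo *info)
{
        t->forced_info = *info;                    /* RT-added per-task slot */
        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); /* make sure the exit loop runs */
}

/* Exit-to-user side (see the arch/x86/entry/common.c hunk earlier):
 * preemption is enabled again, so force_sig_info() may sleep safely. */
static void rt_flush_forced_signal(struct task_struct *t)
{
        if (unlikely(t->forced_info.si_signo)) {
                force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
                t->forced_info.si_signo = 0;
        }
}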
2136 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2137 index 58505f01962f..02fa39652cd6 100644
2138 --- a/arch/x86/include/asm/stackprotector.h
2139 +++ b/arch/x86/include/asm/stackprotector.h
2140 @@ -59,7 +59,7 @@
2141   */
2142  static __always_inline void boot_init_stack_canary(void)
2143  {
2144 -       u64 canary;
2145 +       u64 uninitialized_var(canary);
2146         u64 tsc;
2147  
2148  #ifdef CONFIG_X86_64
2149 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2150          * of randomness. The TSC only matters for very early init,
2151          * there it already has some randomness on most systems. Later
2152          * on during the bootup the random pool has true entropy too.
2153 +        *
2154 +        * For preempt-rt we need to weaken the randomness a bit, as
2155 +        * we can't call into the random generator from atomic context
2156 +        * due to locking constraints. We just leave the canary
2157 +        * uninitialized and use the TSC-based randomness on top of it.
2158          */
2159 +#ifndef CONFIG_PREEMPT_RT_FULL
2160         get_random_bytes(&canary, sizeof(canary));
2161 +#endif
2162         tsc = rdtsc();
2163         canary += tsc + (tsc << 32UL);
2164  
2165 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2166 index ad6f5eb07a95..5ceb3a1c2b1a 100644
2167 --- a/arch/x86/include/asm/thread_info.h
2168 +++ b/arch/x86/include/asm/thread_info.h
2169 @@ -54,11 +54,14 @@ struct task_struct;
2170  
2171  struct thread_info {
2172         unsigned long           flags;          /* low level flags */
2173 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2174 +                                                          <0 => BUG */
2175  };
2176  
2177  #define INIT_THREAD_INFO(tsk)                  \
2178  {                                              \
2179         .flags          = 0,                    \
2180 +       .preempt_lazy_count = 0,                \
2181  }
2182  
2183  #define init_stack             (init_thread_union.stack)
2184 @@ -67,6 +70,10 @@ struct thread_info {
2185  
2186  #include <asm/asm-offsets.h>
2187  
2188 +#define GET_THREAD_INFO(reg) \
2189 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2190 +       _ASM_SUB $(THREAD_SIZE),reg ;
2191 +
2192  #endif
2193  
2194  /*
2195 @@ -85,6 +92,7 @@ struct thread_info {
2196  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2197  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2198  #define TIF_SECCOMP            8       /* secure computing */
2199 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2200  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2201  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2202  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2203 @@ -108,6 +116,7 @@ struct thread_info {
2204  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2205  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2206  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2207 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2208  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2209  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2210  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2211 @@ -143,6 +152,8 @@ struct thread_info {
2212  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2213  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2214  
2215 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2216 +
2217  #define STACK_WARN             (THREAD_SIZE/8)
2218  
2219  /*
2220 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2221 index 57ab86d94d64..35d25e27180f 100644
2222 --- a/arch/x86/include/asm/uv/uv_bau.h
2223 +++ b/arch/x86/include/asm/uv/uv_bau.h
2224 @@ -624,9 +624,9 @@ struct bau_control {
2225         cycles_t                send_message;
2226         cycles_t                period_end;
2227         cycles_t                period_time;
2228 -       spinlock_t              uvhub_lock;
2229 -       spinlock_t              queue_lock;
2230 -       spinlock_t              disable_lock;
2231 +       raw_spinlock_t          uvhub_lock;
2232 +       raw_spinlock_t          queue_lock;
2233 +       raw_spinlock_t          disable_lock;
2234         /* tunables */
2235         int                     max_concurr;
2236         int                     max_concurr_const;
2237 @@ -815,15 +815,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2238   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2239   * on equal.
2240   */
2241 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2242 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2243  {
2244 -       spin_lock(lock);
2245 +       raw_spin_lock(lock);
2246         if (atomic_read(v) >= u) {
2247 -               spin_unlock(lock);
2248 +               raw_spin_unlock(lock);
2249                 return 0;
2250         }
2251         atomic_inc(v);
2252 -       spin_unlock(lock);
2253 +       raw_spin_unlock(lock);
2254         return 1;
2255  }
2256  
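On RT a spinlock_t is a sleeping rtmutex-based lock, which must not be taken from the interrupts-off TLB shootdown paths the BAU code runs in, so the three hub locks are converted to raw_spinlock_t and keep their spinning semantics; atomic_inc_unless_ge() changes its parameter type to match. A hedged sketch of the conversion pattern, with illustrative structure and field names, is:

/* Sketch of the raw-spinlock conversion pattern used above; hub_stats and
 * its fields are invented for illustration. */
#include <linux/spinlock.h>

struct hub_stats {
        raw_spinlock_t  lock;           /* was: spinlock_t */
        int             active;
};

static void hub_init(struct hub_stats *hs)
{
        raw_spin_lock_init(&hs->lock);
        hs->active = 0;
}

static void hub_account(struct hub_stats *hs)
{
        unsigned long flags;

        /* Still a true busy-waiting lock on RT, safe with interrupts off. */
        raw_spin_lock_irqsave(&hs->lock, flags);
        hs->active++;
        raw_spin_unlock_irqrestore(&hs->lock, flags);
}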
2257 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2258 index 931ced8ca345..167975ac8af7 100644
2259 --- a/arch/x86/kernel/acpi/boot.c
2260 +++ b/arch/x86/kernel/acpi/boot.c
2261 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2262   *             ->ioapic_mutex
2263   *                     ->ioapic_lock
2264   */
2265 +#ifdef CONFIG_X86_IO_APIC
2266  static DEFINE_MUTEX(acpi_ioapic_lock);
2267 +#endif
2268  
2269  /* --------------------------------------------------------------------------
2270                                Boot-time Configuration
2271 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2272 index 3d8ff40ecc6f..2e96d4e0295b 100644
2273 --- a/arch/x86/kernel/apic/io_apic.c
2274 +++ b/arch/x86/kernel/apic/io_apic.c
2275 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2276  static inline bool ioapic_irqd_mask(struct irq_data *data)
2277  {
2278         /* If we are moving the irq we need to mask it */
2279 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2280 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2281 +                    !irqd_irq_inprogress(data))) {
2282                 mask_ioapic_irq(data);
2283                 return true;
2284         }
2285 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2286 index c62e015b126c..0cc71257fca6 100644
2287 --- a/arch/x86/kernel/asm-offsets.c
2288 +++ b/arch/x86/kernel/asm-offsets.c
2289 @@ -36,6 +36,7 @@ void common(void) {
2290  
2291         BLANK();
2292         OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
2293 +       OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count);
2294         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2295  
2296         BLANK();
2297 @@ -91,4 +92,5 @@ void common(void) {
2298  
2299         BLANK();
2300         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2301 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2302  }
2303 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2304 index a7fdf453d895..e3a0e969a66e 100644
2305 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2306 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2307 @@ -41,6 +41,8 @@
2308  #include <linux/debugfs.h>
2309  #include <linux/irq_work.h>
2310  #include <linux/export.h>
2311 +#include <linux/jiffies.h>
2312 +#include <linux/swork.h>
2313  #include <linux/jump_label.h>
2314  
2315  #include <asm/processor.h>
2316 @@ -1317,7 +1319,7 @@ void mce_log_therm_throt_event(__u64 status)
2317  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2318  
2319  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2320 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2321 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2322  
2323  static unsigned long mce_adjust_timer_default(unsigned long interval)
2324  {
2325 @@ -1326,32 +1328,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2326  
2327  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2328  
2329 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2330 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2331  {
2332 -       unsigned long when = jiffies + interval;
2333 -       unsigned long flags;
2334 -
2335 -       local_irq_save(flags);
2336 -
2337 -       if (timer_pending(t)) {
2338 -               if (time_before(when, t->expires))
2339 -                       mod_timer(t, when);
2340 -       } else {
2341 -               t->expires = round_jiffies(when);
2342 -               add_timer_on(t, smp_processor_id());
2343 -       }
2344 -
2345 -       local_irq_restore(flags);
2346 +       if (!interval)
2347 +               return HRTIMER_NORESTART;
2348 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2349 +       return HRTIMER_RESTART;
2350  }
2351  
2352 -static void mce_timer_fn(unsigned long data)
2353 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2354  {
2355 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2356 -       int cpu = smp_processor_id();
2357         unsigned long iv;
2358  
2359 -       WARN_ON(cpu != data);
2360 -
2361         iv = __this_cpu_read(mce_next_interval);
2362  
2363         if (mce_available(this_cpu_ptr(&cpu_info))) {
2364 @@ -1374,7 +1362,7 @@ static void mce_timer_fn(unsigned long data)
2365  
2366  done:
2367         __this_cpu_write(mce_next_interval, iv);
2368 -       __restart_timer(t, iv);
2369 +       return __restart_timer(timer, iv);
2370  }
2371  
2372  /*
2373 @@ -1382,7 +1370,7 @@ static void mce_timer_fn(unsigned long data)
2374   */
2375  void mce_timer_kick(unsigned long interval)
2376  {
2377 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2378 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2379         unsigned long iv = __this_cpu_read(mce_next_interval);
2380  
2381         __restart_timer(t, interval);
2382 @@ -1397,7 +1385,7 @@ static void mce_timer_delete_all(void)
2383         int cpu;
2384  
2385         for_each_online_cpu(cpu)
2386 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2387 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2388  }
2389  
2390  static void mce_do_trigger(struct work_struct *work)
2391 @@ -1407,6 +1395,56 @@ static void mce_do_trigger(struct work_struct *work)
2392  
2393  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2394  
2395 +static void __mce_notify_work(struct swork_event *event)
2396 +{
2397 +       /* Not more than two messages every minute */
2398 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2399 +
2400 +       /* wake processes polling /dev/mcelog */
2401 +       wake_up_interruptible(&mce_chrdev_wait);
2402 +
2403 +       /*
2404 +        * There is no risk of missing notifications because
2405 +        * work_pending is always cleared before the function is
2406 +        * executed.
2407 +        */
2408 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2409 +               schedule_work(&mce_trigger_work);
2410 +
2411 +       if (__ratelimit(&ratelimit))
2412 +               pr_info(HW_ERR "Machine check events logged\n");
2413 +}
2414 +
2415 +#ifdef CONFIG_PREEMPT_RT_FULL
2416 +static bool notify_work_ready __read_mostly;
2417 +static struct swork_event notify_work;
2418 +
2419 +static int mce_notify_work_init(void)
2420 +{
2421 +       int err;
2422 +
2423 +       err = swork_get();
2424 +       if (err)
2425 +               return err;
2426 +
2427 +       INIT_SWORK(&notify_work, __mce_notify_work);
2428 +       notify_work_ready = true;
2429 +       return 0;
2430 +}
2431 +
2432 +static void mce_notify_work(void)
2433 +{
2434 +       if (notify_work_ready)
2435 +               swork_queue(&notify_work);
2436 +}
2437 +#else
2438 +static void mce_notify_work(void)
2439 +{
2440 +       __mce_notify_work(NULL);
2441 +}
2442 +static inline int mce_notify_work_init(void) { return 0; }
2443 +#endif
2444 +
2445  /*
2446   * Notify the user(s) about new machine check events.
2447   * Can be called from interrupt context, but not from machine check/NMI
2448 @@ -1414,19 +1452,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2449   */
2450  int mce_notify_irq(void)
2451  {
2452 -       /* Not more than two messages every minute */
2453 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2454 -
2455         if (test_and_clear_bit(0, &mce_need_notify)) {
2456 -               /* wake processes polling /dev/mcelog */
2457 -               wake_up_interruptible(&mce_chrdev_wait);
2458 -
2459 -               if (mce_helper[0])
2460 -                       schedule_work(&mce_trigger_work);
2461 -
2462 -               if (__ratelimit(&ratelimit))
2463 -                       pr_info(HW_ERR "Machine check events logged\n");
2464 -
2465 +               mce_notify_work();
2466                 return 1;
2467         }
2468         return 0;
2469 @@ -1732,7 +1759,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2470         }
2471  }
2472  
2473 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2474 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2475  {
2476         unsigned long iv = check_interval * HZ;
2477  
2478 @@ -1741,16 +1768,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2479  
2480         per_cpu(mce_next_interval, cpu) = iv;
2481  
2482 -       t->expires = round_jiffies(jiffies + iv);
2483 -       add_timer_on(t, cpu);
2484 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2485 +                       0, HRTIMER_MODE_REL_PINNED);
2486  }
2487  
2488  static void __mcheck_cpu_init_timer(void)
2489  {
2490 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2491 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2492         unsigned int cpu = smp_processor_id();
2493  
2494 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2495 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2496 +       t->function = mce_timer_fn;
2497         mce_start_timer(cpu, t);
2498  }
2499  
2500 @@ -2475,6 +2503,8 @@ static void mce_disable_cpu(void *h)
2501         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2502                 return;
2503  
2504 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2505 +
2506         if (!(action & CPU_TASKS_FROZEN))
2507                 cmci_clear();
2508  
2509 @@ -2497,6 +2527,7 @@ static void mce_reenable_cpu(void *h)
2510                 if (b->init)
2511                         wrmsrl(msr_ops.ctl(i), b->ctl);
2512         }
2513 +       __mcheck_cpu_init_timer();
2514  }
2515  
2516  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2517 @@ -2504,7 +2535,6 @@ static int
2518  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2519  {
2520         unsigned int cpu = (unsigned long)hcpu;
2521 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2522  
2523         switch (action & ~CPU_TASKS_FROZEN) {
2524         case CPU_ONLINE:
2525 @@ -2524,11 +2554,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2526                 break;
2527         case CPU_DOWN_PREPARE:
2528                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2529 -               del_timer_sync(t);
2530                 break;
2531         case CPU_DOWN_FAILED:
2532                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2533 -               mce_start_timer(cpu, t);
2534                 break;
2535         }
2536  
2537 @@ -2567,6 +2595,10 @@ static __init int mcheck_init_device(void)
2538                 goto err_out;
2539         }
2540  
2541 +       err = mce_notify_work_init();
2542 +       if (err)
2543 +               goto err_out;
2544 +
2545         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2546                 err = -ENOMEM;
2547                 goto err_out;
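The MCE changes replace the per-CPU timer_list poller with an hrtimer whose callback re-arms itself via hrtimer_forward_now(), and move the user notification into swork on RT so that wake_up_interruptible() and schedule_work() are never called from a context that may not take sleeping locks. The sketch below shows the generic shape of such a self-rearming hrtimer poller; the names and the 100 ms interval are illustrative, not the mce.c values.

/* Generic shape of a self-rearming hrtimer poller, as used above (sketch). */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer poll_timer;
static unsigned long poll_interval_ns = 100 * 1000 * 1000;  /* 100 ms, example */

static enum hrtimer_restart poll_fn(struct hrtimer *timer)
{
        /* ... periodic check would go here ... */

        if (!poll_interval_ns)
                return HRTIMER_NORESTART;

        hrtimer_forward_now(timer, ns_to_ktime(poll_interval_ns));
        return HRTIMER_RESTART;         /* re-armed relative to now */
}

static void poll_start(void)
{
        hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        poll_timer.function = poll_fn;
        hrtimer_start(&poll_timer, ns_to_ktime(poll_interval_ns),
                      HRTIMER_MODE_REL);
}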
2548 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2549 index 1f38d9a4d9de..053bf3b2ef39 100644
2550 --- a/arch/x86/kernel/irq_32.c
2551 +++ b/arch/x86/kernel/irq_32.c
2552 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2553                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2554  }
2555  
2556 +#ifndef CONFIG_PREEMPT_RT_FULL
2557  void do_softirq_own_stack(void)
2558  {
2559         struct irq_stack *irqstk;
2560 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2561  
2562         call_on_stack(__do_softirq, isp);
2563  }
2564 +#endif
2565  
2566  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2567  {
2568 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2569 index bd7be8efdc4c..b3b0a7f7b1ca 100644
2570 --- a/arch/x86/kernel/process_32.c
2571 +++ b/arch/x86/kernel/process_32.c
2572 @@ -35,6 +35,7 @@
2573  #include <linux/uaccess.h>
2574  #include <linux/io.h>
2575  #include <linux/kdebug.h>
2576 +#include <linux/highmem.h>
2577  
2578  #include <asm/pgtable.h>
2579  #include <asm/ldt.h>
2580 @@ -195,6 +196,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2581  }
2582  EXPORT_SYMBOL_GPL(start_thread);
2583  
2584 +#ifdef CONFIG_PREEMPT_RT_FULL
2585 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2586 +{
2587 +       int i;
2588 +
2589 +       /*
2590 +        * Clear @prev's kmap_atomic mappings
2591 +        */
2592 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2593 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2594 +               pte_t *ptep = kmap_pte - idx;
2595 +
2596 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2597 +       }
2598 +       /*
2599 +        * Restore @next_p's kmap_atomic mappings
2600 +        */
2601 +       for (i = 0; i < next_p->kmap_idx; i++) {
2602 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2603 +
2604 +               if (!pte_none(next_p->kmap_pte[i]))
2605 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2606 +       }
2607 +}
2608 +#else
2609 +static inline void
2610 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2611 +#endif
2612 +
2613  
2614  /*
2615   *     switch_to(x,y) should switch tasks from x to y.
2616 @@ -271,6 +301,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2617                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2618                 __switch_to_xtra(prev_p, next_p, tss);
2619  
2620 +       switch_kmaps(prev_p, next_p);
2621 +
2622         /*
2623          * Leave lazy mode, flushing any hypercalls made here.
2624          * This must be done before restoring TLS segments so
2625 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2626 index 3f05c044720b..fe68afd37162 100644
2627 --- a/arch/x86/kvm/lapic.c
2628 +++ b/arch/x86/kvm/lapic.c
2629 @@ -1939,6 +1939,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2630         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2631                      HRTIMER_MODE_ABS_PINNED);
2632         apic->lapic_timer.timer.function = apic_timer_fn;
2633 +       apic->lapic_timer.timer.irqsafe = 1;
2634  
2635         /*
2636          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2637 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2638 index 487b957e7802..a144b8cb358b 100644
2639 --- a/arch/x86/kvm/x86.c
2640 +++ b/arch/x86/kvm/x86.c
2641 @@ -5932,6 +5932,13 @@ int kvm_arch_init(void *opaque)
2642                 goto out;
2643         }
2644  
2645 +#ifdef CONFIG_PREEMPT_RT_FULL
2646 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2647 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2648 +               return -EOPNOTSUPP;
2649 +       }
2650 +#endif
2651 +
2652         r = kvm_mmu_module_init();
2653         if (r)
2654                 goto out_free_percpu;
2655 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2656 index 6d18b70ed5a9..f752724c22e8 100644
2657 --- a/arch/x86/mm/highmem_32.c
2658 +++ b/arch/x86/mm/highmem_32.c
2659 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2660   */
2661  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2662  {
2663 +       pte_t pte = mk_pte(page, prot);
2664         unsigned long vaddr;
2665         int idx, type;
2666  
2667 -       preempt_disable();
2668 +       preempt_disable_nort();
2669         pagefault_disable();
2670  
2671         if (!PageHighMem(page))
2672 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2673         idx = type + KM_TYPE_NR*smp_processor_id();
2674         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2675         BUG_ON(!pte_none(*(kmap_pte-idx)));
2676 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2677 +#ifdef CONFIG_PREEMPT_RT_FULL
2678 +       current->kmap_pte[type] = pte;
2679 +#endif
2680 +       set_pte(kmap_pte-idx, pte);
2681         arch_flush_lazy_mmu_mode();
2682  
2683         return (void *)vaddr;
2684 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2685                  * is a bad idea also, in case the page changes cacheability
2686                  * attributes or becomes a protected page in a hypervisor.
2687                  */
2688 +#ifdef CONFIG_PREEMPT_RT_FULL
2689 +               current->kmap_pte[type] = __pte(0);
2690 +#endif
2691                 kpte_clear_flush(kmap_pte-idx, vaddr);
2692                 kmap_atomic_idx_pop();
2693                 arch_flush_lazy_mmu_mode();
2694 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2695  #endif
2696  
2697         pagefault_enable();
2698 -       preempt_enable();
2699 +       preempt_enable_nort();
2700  }
2701  EXPORT_SYMBOL(__kunmap_atomic);
2702  
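With preempt_disable_nort(), an RT kernel leaves the region between kmap_atomic() and kunmap_atomic() preemptible, so the fixmap pte is additionally recorded in current->kmap_pte[] here (and cleared on unmap), and switch_kmaps() in the process_32.c hunk above replays those slots when the task is switched back in. The usage-level sketch below only illustrates why that bookkeeping is needed; the helper name is made up.

/* Sketch: an ordinary kmap_atomic() user.  On !RT this section runs with
 * preemption off; on RT it may be preempted, and the per-task kmap_pte[]
 * bookkeeping above restores the mapping when the task runs again. */
#include <linux/highmem.h>
#include <linux/string.h>

static void copy_from_highpage_example(struct page *page, void *buf, size_t len)
{
        void *vaddr = kmap_atomic(page);

        memcpy(buf, vaddr, len);

        kunmap_atomic(vaddr);
}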
2703 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2704 index ada98b39b8ad..585f6829653b 100644
2705 --- a/arch/x86/mm/iomap_32.c
2706 +++ b/arch/x86/mm/iomap_32.c
2707 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2708  
2709  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2710  {
2711 +       pte_t pte = pfn_pte(pfn, prot);
2712         unsigned long vaddr;
2713         int idx, type;
2714  
2715 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2716         type = kmap_atomic_idx_push();
2717         idx = type + KM_TYPE_NR * smp_processor_id();
2718         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2719 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2720 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2721 +
2722 +#ifdef CONFIG_PREEMPT_RT_FULL
2723 +       current->kmap_pte[type] = pte;
2724 +#endif
2725 +       set_pte(kmap_pte - idx, pte);
2726         arch_flush_lazy_mmu_mode();
2727  
2728         return (void *)vaddr;
2729 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2730                  * is a bad idea also, in case the page changes cacheability
2731                  * attributes or becomes a protected page in a hypervisor.
2732                  */
2733 +#ifdef CONFIG_PREEMPT_RT_FULL
2734 +               current->kmap_pte[type] = __pte(0);
2735 +#endif
2736                 kpte_clear_flush(kmap_pte-idx, vaddr);
2737                 kmap_atomic_idx_pop();
2738         }
2739 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2740 index e3353c97d086..01664968555c 100644
2741 --- a/arch/x86/mm/pageattr.c
2742 +++ b/arch/x86/mm/pageattr.c
2743 @@ -214,7 +214,15 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
2744                             int in_flags, struct page **pages)
2745  {
2746         unsigned int i, level;
2747 +#ifdef CONFIG_PREEMPT
2748 +       /*
2749 +        * Avoid wbinvd() because it causes latencies on all CPUs,
2750 +        * regardless of any CPU isolation that may be in effect.
2751 +        */
2752 +       unsigned long do_wbinvd = 0;
2753 +#else
2754         unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
2755 +#endif
2756  
2757         BUG_ON(irqs_disabled());
2758  
2759 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2760 index 9e42842e924a..5398f97172f9 100644
2761 --- a/arch/x86/platform/uv/tlb_uv.c
2762 +++ b/arch/x86/platform/uv/tlb_uv.c
2763 @@ -748,9 +748,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2764  
2765                 quiesce_local_uvhub(hmaster);
2766  
2767 -               spin_lock(&hmaster->queue_lock);
2768 +               raw_spin_lock(&hmaster->queue_lock);
2769                 reset_with_ipi(&bau_desc->distribution, bcp);
2770 -               spin_unlock(&hmaster->queue_lock);
2771 +               raw_spin_unlock(&hmaster->queue_lock);
2772  
2773                 end_uvhub_quiesce(hmaster);
2774  
2775 @@ -770,9 +770,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2776  
2777                 quiesce_local_uvhub(hmaster);
2778  
2779 -               spin_lock(&hmaster->queue_lock);
2780 +               raw_spin_lock(&hmaster->queue_lock);
2781                 reset_with_ipi(&bau_desc->distribution, bcp);
2782 -               spin_unlock(&hmaster->queue_lock);
2783 +               raw_spin_unlock(&hmaster->queue_lock);
2784  
2785                 end_uvhub_quiesce(hmaster);
2786  
2787 @@ -793,7 +793,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2788         cycles_t tm1;
2789  
2790         hmaster = bcp->uvhub_master;
2791 -       spin_lock(&hmaster->disable_lock);
2792 +       raw_spin_lock(&hmaster->disable_lock);
2793         if (!bcp->baudisabled) {
2794                 stat->s_bau_disabled++;
2795                 tm1 = get_cycles();
2796 @@ -806,7 +806,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2797                         }
2798                 }
2799         }
2800 -       spin_unlock(&hmaster->disable_lock);
2801 +       raw_spin_unlock(&hmaster->disable_lock);
2802  }
2803  
2804  static void count_max_concurr(int stat, struct bau_control *bcp,
2805 @@ -869,7 +869,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2806   */
2807  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2808  {
2809 -       spinlock_t *lock = &hmaster->uvhub_lock;
2810 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2811         atomic_t *v;
2812  
2813         v = &hmaster->active_descriptor_count;
2814 @@ -1002,7 +1002,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2815         struct bau_control *hmaster;
2816  
2817         hmaster = bcp->uvhub_master;
2818 -       spin_lock(&hmaster->disable_lock);
2819 +       raw_spin_lock(&hmaster->disable_lock);
2820         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2821                 stat->s_bau_reenabled++;
2822                 for_each_present_cpu(tcpu) {
2823 @@ -1014,10 +1014,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2824                                 tbcp->period_giveups = 0;
2825                         }
2826                 }
2827 -               spin_unlock(&hmaster->disable_lock);
2828 +               raw_spin_unlock(&hmaster->disable_lock);
2829                 return 0;
2830         }
2831 -       spin_unlock(&hmaster->disable_lock);
2832 +       raw_spin_unlock(&hmaster->disable_lock);
2833         return -1;
2834  }
2835  
2836 @@ -1940,9 +1940,9 @@ static void __init init_per_cpu_tunables(void)
2837                 bcp->cong_reps                  = congested_reps;
2838                 bcp->disabled_period            = sec_2_cycles(disabled_period);
2839                 bcp->giveup_limit               = giveup_limit;
2840 -               spin_lock_init(&bcp->queue_lock);
2841 -               spin_lock_init(&bcp->uvhub_lock);
2842 -               spin_lock_init(&bcp->disable_lock);
2843 +               raw_spin_lock_init(&bcp->queue_lock);
2844 +               raw_spin_lock_init(&bcp->uvhub_lock);
2845 +               raw_spin_lock_init(&bcp->disable_lock);
2846         }
2847  }
2848  
2849 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2850 index b333fc45f9ec..8b85916e6986 100644
2851 --- a/arch/x86/platform/uv/uv_time.c
2852 +++ b/arch/x86/platform/uv/uv_time.c
2853 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2854  
2855  /* There is one of these allocated per node */
2856  struct uv_rtc_timer_head {
2857 -       spinlock_t      lock;
2858 +       raw_spinlock_t  lock;
2859         /* next cpu waiting for timer, local node relative: */
2860         int             next_cpu;
2861         /* number of cpus on this node: */
2862 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2863                                 uv_rtc_deallocate_timers();
2864                                 return -ENOMEM;
2865                         }
2866 -                       spin_lock_init(&head->lock);
2867 +                       raw_spin_lock_init(&head->lock);
2868                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2869                         head->next_cpu = -1;
2870                         blade_info[bid] = head;
2871 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2872         unsigned long flags;
2873         int next_cpu;
2874  
2875 -       spin_lock_irqsave(&head->lock, flags);
2876 +       raw_spin_lock_irqsave(&head->lock, flags);
2877  
2878         next_cpu = head->next_cpu;
2879         *t = expires;
2880 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2881                 if (uv_setup_intr(cpu, expires)) {
2882                         *t = ULLONG_MAX;
2883                         uv_rtc_find_next_timer(head, pnode);
2884 -                       spin_unlock_irqrestore(&head->lock, flags);
2885 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2886                         return -ETIME;
2887                 }
2888         }
2889  
2890 -       spin_unlock_irqrestore(&head->lock, flags);
2891 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2892         return 0;
2893  }
2894  
2895 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2896         unsigned long flags;
2897         int rc = 0;
2898  
2899 -       spin_lock_irqsave(&head->lock, flags);
2900 +       raw_spin_lock_irqsave(&head->lock, flags);
2901  
2902         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2903                 rc = 1;
2904 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2905                         uv_rtc_find_next_timer(head, pnode);
2906         }
2907  
2908 -       spin_unlock_irqrestore(&head->lock, flags);
2909 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2910  
2911         return rc;
2912  }
2913 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2914  static cycle_t uv_read_rtc(struct clocksource *cs)
2915  {
2916         unsigned long offset;
2917 +       cycle_t cycles;
2918  
2919 +       preempt_disable();
2920         if (uv_get_min_hub_revision_id() == 1)
2921                 offset = 0;
2922         else
2923                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2924  
2925 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2926 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
2927 +       preempt_enable();
2928 +
2929 +       return cycles;
2930  }
2931  
2932  /*
2933 diff --git a/block/blk-core.c b/block/blk-core.c
2934 index 14d7c0740dc0..dfd905bea77c 100644
2935 --- a/block/blk-core.c
2936 +++ b/block/blk-core.c
2937 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
2938  
2939         INIT_LIST_HEAD(&rq->queuelist);
2940         INIT_LIST_HEAD(&rq->timeout_list);
2941 +#ifdef CONFIG_PREEMPT_RT_FULL
2942 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
2943 +#endif
2944         rq->cpu = -1;
2945         rq->q = q;
2946         rq->__sector = (sector_t) -1;
2947 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
2948   **/
2949  void blk_start_queue(struct request_queue *q)
2950  {
2951 -       WARN_ON(!irqs_disabled());
2952 +       WARN_ON_NONRT(!irqs_disabled());
2953  
2954         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
2955         __blk_run_queue(q);
2956 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
2957                 if (nowait)
2958                         return -EBUSY;
2959  
2960 -               ret = wait_event_interruptible(q->mq_freeze_wq,
2961 +               ret = swait_event_interruptible(q->mq_freeze_wq,
2962                                 !atomic_read(&q->mq_freeze_depth) ||
2963                                 blk_queue_dying(q));
2964                 if (blk_queue_dying(q))
2965 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
2966         struct request_queue *q =
2967                 container_of(ref, struct request_queue, q_usage_counter);
2968  
2969 -       wake_up_all(&q->mq_freeze_wq);
2970 +       swake_up_all(&q->mq_freeze_wq);
2971  }
2972  
2973  static void blk_rq_timed_out_timer(unsigned long data)
2974 @@ -748,7 +751,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
2975         q->bypass_depth = 1;
2976         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
2977  
2978 -       init_waitqueue_head(&q->mq_freeze_wq);
2979 +       init_swait_queue_head(&q->mq_freeze_wq);
2980  
2981         /*
2982          * Init percpu_ref in atomic mode so that it's faster to shutdown.
2983 @@ -3177,7 +3180,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
2984                 blk_run_queue_async(q);
2985         else
2986                 __blk_run_queue(q);
2987 -       spin_unlock(q->queue_lock);
2988 +       spin_unlock_irq(q->queue_lock);
2989  }
2990  
2991  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
2992 @@ -3225,7 +3228,6 @@ EXPORT_SYMBOL(blk_check_plugged);
2993  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2994  {
2995         struct request_queue *q;
2996 -       unsigned long flags;
2997         struct request *rq;
2998         LIST_HEAD(list);
2999         unsigned int depth;
3000 @@ -3245,11 +3247,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3001         q = NULL;
3002         depth = 0;
3003  
3004 -       /*
3005 -        * Save and disable interrupts here, to avoid doing it for every
3006 -        * queue lock we have to take.
3007 -        */
3008 -       local_irq_save(flags);
3009         while (!list_empty(&list)) {
3010                 rq = list_entry_rq(list.next);
3011                 list_del_init(&rq->queuelist);
3012 @@ -3262,7 +3259,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3013                                 queue_unplugged(q, depth, from_schedule);
3014                         q = rq->q;
3015                         depth = 0;
3016 -                       spin_lock(q->queue_lock);
3017 +                       spin_lock_irq(q->queue_lock);
3018                 }
3019  
3020                 /*
3021 @@ -3289,8 +3286,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3022          */
3023         if (q)
3024                 queue_unplugged(q, depth, from_schedule);
3025 -
3026 -       local_irq_restore(flags);
3027  }
3028  
3029  void blk_finish_plug(struct blk_plug *plug)
3030 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3031 index 381cb50a673c..dc8785233d94 100644
3032 --- a/block/blk-ioc.c
3033 +++ b/block/blk-ioc.c
3034 @@ -7,6 +7,7 @@
3035  #include <linux/bio.h>
3036  #include <linux/blkdev.h>
3037  #include <linux/slab.h>
3038 +#include <linux/delay.h>
3039  
3040  #include "blk.h"
3041  
3042 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3043                         spin_unlock(q->queue_lock);
3044                 } else {
3045                         spin_unlock_irqrestore(&ioc->lock, flags);
3046 -                       cpu_relax();
3047 +                       cpu_chill();
3048                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3049                 }
3050         }
3051 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3052                         spin_unlock(icq->q->queue_lock);
3053                 } else {
3054                         spin_unlock_irqrestore(&ioc->lock, flags);
3055 -                       cpu_relax();
3056 +                       cpu_chill();
3057                         goto retry;
3058                 }
3059         }
3060 diff --git a/block/blk-mq.c b/block/blk-mq.c
3061 index 81caceb96c3c..b12b0ab005a9 100644
3062 --- a/block/blk-mq.c
3063 +++ b/block/blk-mq.c
3064 @@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3065  
3066  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3067  {
3068 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3069 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3070  }
3071  
3072  /*
3073 @@ -110,7 +110,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3074         WARN_ON_ONCE(freeze_depth < 0);
3075         if (!freeze_depth) {
3076                 percpu_ref_reinit(&q->q_usage_counter);
3077 -               wake_up_all(&q->mq_freeze_wq);
3078 +               swake_up_all(&q->mq_freeze_wq);
3079         }
3080  }
3081  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3082 @@ -129,7 +129,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3083          * dying, we need to ensure that processes currently waiting on
3084          * the queue are notified as well.
3085          */
3086 -       wake_up_all(&q->mq_freeze_wq);
3087 +       swake_up_all(&q->mq_freeze_wq);
3088  }
3089  
3090  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3091 @@ -177,6 +177,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3092         rq->resid_len = 0;
3093         rq->sense = NULL;
3094  
3095 +#ifdef CONFIG_PREEMPT_RT_FULL
3096 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3097 +#endif
3098         INIT_LIST_HEAD(&rq->timeout_list);
3099         rq->timeout = 0;
3100  
3101 @@ -345,6 +348,17 @@ void blk_mq_end_request(struct request *rq, int error)
3102  }
3103  EXPORT_SYMBOL(blk_mq_end_request);
3104  
3105 +#ifdef CONFIG_PREEMPT_RT_FULL
3106 +
3107 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3108 +{
3109 +       struct request *rq = container_of(work, struct request, work);
3110 +
3111 +       rq->q->softirq_done_fn(rq);
3112 +}
3113 +
3114 +#else
3115 +
3116  static void __blk_mq_complete_request_remote(void *data)
3117  {
3118         struct request *rq = data;
3119 @@ -352,6 +366,8 @@ static void __blk_mq_complete_request_remote(void *data)
3120         rq->q->softirq_done_fn(rq);
3121  }
3122  
3123 +#endif
3124 +
3125  static void blk_mq_ipi_complete_request(struct request *rq)
3126  {
3127         struct blk_mq_ctx *ctx = rq->mq_ctx;
3128 @@ -363,19 +379,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3129                 return;
3130         }
3131  
3132 -       cpu = get_cpu();
3133 +       cpu = get_cpu_light();
3134         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3135                 shared = cpus_share_cache(cpu, ctx->cpu);
3136  
3137         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3138 +#ifdef CONFIG_PREEMPT_RT_FULL
3139 +               schedule_work_on(ctx->cpu, &rq->work);
3140 +#else
3141                 rq->csd.func = __blk_mq_complete_request_remote;
3142                 rq->csd.info = rq;
3143                 rq->csd.flags = 0;
3144                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3145 +#endif
3146         } else {
3147                 rq->q->softirq_done_fn(rq);
3148         }
3149 -       put_cpu();
3150 +       put_cpu_light();
3151  }
3152  
3153  static void __blk_mq_complete_request(struct request *rq)
3154 @@ -915,14 +935,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3155                 return;
3156  
3157         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
3158 -               int cpu = get_cpu();
3159 +               int cpu = get_cpu_light();
3160                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3161                         __blk_mq_run_hw_queue(hctx);
3162 -                       put_cpu();
3163 +                       put_cpu_light();
3164                         return;
3165                 }
3166  
3167 -               put_cpu();
3168 +               put_cpu_light();
3169         }
3170  
3171         kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
3172 diff --git a/block/blk-mq.h b/block/blk-mq.h
3173 index e5d25249028c..1e846b842eab 100644
3174 --- a/block/blk-mq.h
3175 +++ b/block/blk-mq.h
3176 @@ -72,12 +72,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3177   */
3178  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3179  {
3180 -       return __blk_mq_get_ctx(q, get_cpu());
3181 +       return __blk_mq_get_ctx(q, get_cpu_light());
3182  }
3183  
3184  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3185  {
3186 -       put_cpu();
3187 +       put_cpu_light();
3188  }
3189  
3190  struct blk_mq_alloc_data {
3191 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3192 index 06cf9807f49a..c40342643ca0 100644
3193 --- a/block/blk-softirq.c
3194 +++ b/block/blk-softirq.c
3195 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3196                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3197  
3198         local_irq_restore(flags);
3199 +       preempt_check_resched_rt();
3200  }
3201  
3202  /*
3203 @@ -89,6 +90,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
3204                          this_cpu_ptr(&blk_cpu_done));
3205         raise_softirq_irqoff(BLOCK_SOFTIRQ);
3206         local_irq_enable();
3207 +       preempt_check_resched_rt();
3208  
3209         return 0;
3210  }
3211 @@ -141,6 +143,7 @@ void __blk_complete_request(struct request *req)
3212                 goto do_local;
3213  
3214         local_irq_restore(flags);
3215 +       preempt_check_resched_rt();
3216  }
3217  
3218  /**
3219 diff --git a/block/bounce.c b/block/bounce.c
3220 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3221 --- a/block/bounce.c
3222 +++ b/block/bounce.c
3223 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3224         unsigned long flags;
3225         unsigned char *vto;
3226  
3227 -       local_irq_save(flags);
3228 +       local_irq_save_nort(flags);
3229         vto = kmap_atomic(to->bv_page);
3230         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3231         kunmap_atomic(vto);
3232 -       local_irq_restore(flags);
3233 +       local_irq_restore_nort(flags);
3234  }
3235  
3236  #else /* CONFIG_HIGHMEM */
3237 diff --git a/crypto/algapi.c b/crypto/algapi.c
3238 index df939b54b09f..efe5e06adcf7 100644
3239 --- a/crypto/algapi.c
3240 +++ b/crypto/algapi.c
3241 @@ -718,13 +718,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3242  
3243  int crypto_register_notifier(struct notifier_block *nb)
3244  {
3245 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3246 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3247  }
3248  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3249  
3250  int crypto_unregister_notifier(struct notifier_block *nb)
3251  {
3252 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3253 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3254  }
3255  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3256  
3257 diff --git a/crypto/api.c b/crypto/api.c
3258 index bbc147cb5dec..bc1a848f02ec 100644
3259 --- a/crypto/api.c
3260 +++ b/crypto/api.c
3261 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3262  DECLARE_RWSEM(crypto_alg_sem);
3263  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3264  
3265 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3266 +SRCU_NOTIFIER_HEAD(crypto_chain);
3267  EXPORT_SYMBOL_GPL(crypto_chain);
3268  
3269  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3270 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3271  {
3272         int ok;
3273  
3274 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3275 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3276         if (ok == NOTIFY_DONE) {
3277                 request_module("cryptomgr");
3278 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3279 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3280         }
3281  
3282         return ok;
3283 diff --git a/crypto/internal.h b/crypto/internal.h
3284 index 7eefcdb00227..0ecc7f5a2f40 100644
3285 --- a/crypto/internal.h
3286 +++ b/crypto/internal.h
3287 @@ -47,7 +47,7 @@ struct crypto_larval {
3288  
3289  extern struct list_head crypto_alg_list;
3290  extern struct rw_semaphore crypto_alg_sem;
3291 -extern struct blocking_notifier_head crypto_chain;
3292 +extern struct srcu_notifier_head crypto_chain;
3293  
3294  #ifdef CONFIG_PROC_FS
3295  void __init crypto_init_proc(void);
3296 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3297  
3298  static inline void crypto_notify(unsigned long val, void *v)
3299  {
3300 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3301 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3302  }
3303  
3304  #endif /* _CRYPTO_INTERNAL_H */
3305 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3306 index 750fa824d42c..441edf51484a 100644
3307 --- a/drivers/acpi/acpica/acglobal.h
3308 +++ b/drivers/acpi/acpica/acglobal.h
3309 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3310   * interrupt level
3311   */
3312  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3313 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3314 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3315  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3316  
3317  /* Mutex for _OSI support */
3318 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3319 index 3b7fb99362b6..696bf8e62afb 100644
3320 --- a/drivers/acpi/acpica/hwregs.c
3321 +++ b/drivers/acpi/acpica/hwregs.c
3322 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3323                           ACPI_BITMASK_ALL_FIXED_STATUS,
3324                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3325  
3326 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3327 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3328  
3329         /* Clear the fixed events in PM1 A/B */
3330  
3331         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3332                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3333  
3334 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3335 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3336  
3337         if (ACPI_FAILURE(status)) {
3338                 goto exit;
3339 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3340 index 98c26ff39409..6e236f2ea791 100644
3341 --- a/drivers/acpi/acpica/hwxface.c
3342 +++ b/drivers/acpi/acpica/hwxface.c
3343 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3344                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3345         }
3346  
3347 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3348 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3349  
3350         /*
3351          * At this point, we know that the parent register is one of the
3352 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3353  
3354  unlock_and_exit:
3355  
3356 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3357 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3358         return_ACPI_STATUS(status);
3359  }
3360  
3361 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3362 index 15073375bd00..357e7ca5a587 100644
3363 --- a/drivers/acpi/acpica/utmutex.c
3364 +++ b/drivers/acpi/acpica/utmutex.c
3365 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3366                 return_ACPI_STATUS (status);
3367         }
3368  
3369 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3370 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3371         if (ACPI_FAILURE (status)) {
3372                 return_ACPI_STATUS (status);
3373         }
3374 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3375         /* Delete the spinlocks */
3376  
3377         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3378 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3379 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3380         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3381  
3382         /* Delete the reader/writer lock */
3383 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3384 index 051b6158d1b7..7ad293bef6ed 100644
3385 --- a/drivers/ata/libata-sff.c
3386 +++ b/drivers/ata/libata-sff.c
3387 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3388         unsigned long flags;
3389         unsigned int consumed;
3390  
3391 -       local_irq_save(flags);
3392 +       local_irq_save_nort(flags);
3393         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3394 -       local_irq_restore(flags);
3395 +       local_irq_restore_nort(flags);
3396  
3397         return consumed;
3398  }
3399 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3400                 unsigned long flags;
3401  
3402                 /* FIXME: use a bounce buffer */
3403 -               local_irq_save(flags);
3404 +               local_irq_save_nort(flags);
3405                 buf = kmap_atomic(page);
3406  
3407                 /* do the actual data transfer */
3408 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3409                                        do_write);
3410  
3411                 kunmap_atomic(buf);
3412 -               local_irq_restore(flags);
3413 +               local_irq_restore_nort(flags);
3414         } else {
3415                 buf = page_address(page);
3416                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3417 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3418                 unsigned long flags;
3419  
3420                 /* FIXME: use bounce buffer */
3421 -               local_irq_save(flags);
3422 +               local_irq_save_nort(flags);
3423                 buf = kmap_atomic(page);
3424  
3425                 /* do the actual data transfer */
3426 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3427                                                                 count, rw);
3428  
3429                 kunmap_atomic(buf);
3430 -               local_irq_restore(flags);
3431 +               local_irq_restore_nort(flags);
3432         } else {
3433                 buf = page_address(page);
3434                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3435 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3436 index 4b5cd3a7b2b6..fa8329ad79fd 100644
3437 --- a/drivers/block/zram/zcomp.c
3438 +++ b/drivers/block/zram/zcomp.c
3439 @@ -118,12 +118,19 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3440  
3441  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3442  {
3443 -       return *get_cpu_ptr(comp->stream);
3444 +       struct zcomp_strm *zstrm;
3445 +
3446 +       zstrm = *this_cpu_ptr(comp->stream);
3447 +       spin_lock(&zstrm->zcomp_lock);
3448 +       return zstrm;
3449  }
3450  
3451  void zcomp_stream_put(struct zcomp *comp)
3452  {
3453 -       put_cpu_ptr(comp->stream);
3454 +       struct zcomp_strm *zstrm;
3455 +
3456 +       zstrm = *this_cpu_ptr(comp->stream);
3457 +       spin_unlock(&zstrm->zcomp_lock);
3458  }
3459  
3460  int zcomp_compress(struct zcomp_strm *zstrm,
3461 @@ -174,6 +181,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3462                         pr_err("Can't allocate a compression stream\n");
3463                         return NOTIFY_BAD;
3464                 }
3465 +               spin_lock_init(&zstrm->zcomp_lock);
3466                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3467                 break;
3468         case CPU_DEAD:
3469 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3470 index 478cac2ed465..f7a6efdc3285 100644
3471 --- a/drivers/block/zram/zcomp.h
3472 +++ b/drivers/block/zram/zcomp.h
3473 @@ -14,6 +14,7 @@ struct zcomp_strm {
3474         /* compression/decompression buffer */
3475         void *buffer;
3476         struct crypto_comp *tfm;
3477 +       spinlock_t zcomp_lock;
3478  };
3479  
3480  /* dynamic per-device compression frontend */
3481 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3482 index d2ef51ca9cf4..05e749736560 100644
3483 --- a/drivers/block/zram/zram_drv.c
3484 +++ b/drivers/block/zram/zram_drv.c
3485 @@ -528,6 +528,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3486                 goto out_error;
3487         }
3488  
3489 +       zram_meta_init_table_locks(meta, disksize);
3490 +
3491         return meta;
3492  
3493  out_error:
3494 @@ -575,28 +577,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3495         struct zram_meta *meta = zram->meta;
3496         unsigned long handle;
3497         unsigned int size;
3498 +       struct zcomp_strm *zstrm;
3499  
3500 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3501 +       zram_lock_table(&meta->table[index]);
3502         handle = meta->table[index].handle;
3503         size = zram_get_obj_size(meta, index);
3504  
3505         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3506 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3507 +               zram_unlock_table(&meta->table[index]);
3508                 clear_page(mem);
3509                 return 0;
3510         }
3511  
3512 +       zstrm = zcomp_stream_get(zram->comp);
3513         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3514         if (size == PAGE_SIZE) {
3515                 copy_page(mem, cmem);
3516         } else {
3517 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3518 -
3519                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3520 -               zcomp_stream_put(zram->comp);
3521         }
3522         zs_unmap_object(meta->mem_pool, handle);
3523 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3524 +       zcomp_stream_put(zram->comp);
3525 +       zram_unlock_table(&meta->table[index]);
3526  
3527         /* Should NEVER happen. Return bio error if it does. */
3528         if (unlikely(ret)) {
3529 @@ -616,14 +618,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3530         struct zram_meta *meta = zram->meta;
3531         page = bvec->bv_page;
3532  
3533 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3534 +       zram_lock_table(&meta->table[index]);
3535         if (unlikely(!meta->table[index].handle) ||
3536                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3537 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3538 +               zram_unlock_table(&meta->table[index]);
3539                 handle_zero_page(bvec);
3540                 return 0;
3541         }
3542 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3543 +       zram_unlock_table(&meta->table[index]);
3544  
3545         if (is_partial_io(bvec))
3546                 /* Use  a temporary buffer to decompress the page */
3547 @@ -700,10 +702,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3548                 if (user_mem)
3549                         kunmap_atomic(user_mem);
3550                 /* Free memory associated with this sector now. */
3551 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3552 +               zram_lock_table(&meta->table[index]);
3553                 zram_free_page(zram, index);
3554                 zram_set_flag(meta, index, ZRAM_ZERO);
3555 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3556 +               zram_unlock_table(&meta->table[index]);
3557  
3558                 atomic64_inc(&zram->stats.zero_pages);
3559                 ret = 0;
3560 @@ -794,12 +796,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3561          * Free memory associated with this sector
3562          * before overwriting unused sectors.
3563          */
3564 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3565 +       zram_lock_table(&meta->table[index]);
3566         zram_free_page(zram, index);
3567  
3568         meta->table[index].handle = handle;
3569         zram_set_obj_size(meta, index, clen);
3570 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3571 +       zram_unlock_table(&meta->table[index]);
3572  
3573         /* Update stats */
3574         atomic64_add(clen, &zram->stats.compr_data_size);
3575 @@ -842,9 +844,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3576         }
3577  
3578         while (n >= PAGE_SIZE) {
3579 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3580 +               zram_lock_table(&meta->table[index]);
3581                 zram_free_page(zram, index);
3582 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3583 +               zram_unlock_table(&meta->table[index]);
3584                 atomic64_inc(&zram->stats.notify_free);
3585                 index++;
3586                 n -= PAGE_SIZE;
3587 @@ -973,9 +975,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3588         zram = bdev->bd_disk->private_data;
3589         meta = zram->meta;
3590  
3591 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3592 +       zram_lock_table(&meta->table[index]);
3593         zram_free_page(zram, index);
3594 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3595 +       zram_unlock_table(&meta->table[index]);
3596         atomic64_inc(&zram->stats.notify_free);
3597  }
3598  
3599 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3600 index 74fcf10da374..fd4020c99b9e 100644
3601 --- a/drivers/block/zram/zram_drv.h
3602 +++ b/drivers/block/zram/zram_drv.h
3603 @@ -73,6 +73,9 @@ enum zram_pageflags {
3604  struct zram_table_entry {
3605         unsigned long handle;
3606         unsigned long value;
3607 +#ifdef CONFIG_PREEMPT_RT_BASE
3608 +       spinlock_t lock;
3609 +#endif
3610  };
3611  
3612  struct zram_stats {
3613 @@ -120,4 +123,42 @@ struct zram {
3614          */
3615         bool claim; /* Protected by bdev->bd_mutex */
3616  };
3617 +
3618 +#ifndef CONFIG_PREEMPT_RT_BASE
3619 +static inline void zram_lock_table(struct zram_table_entry *table)
3620 +{
3621 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3622 +}
3623 +
3624 +static inline void zram_unlock_table(struct zram_table_entry *table)
3625 +{
3626 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3627 +}
3628 +
3629 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3630 +#else /* CONFIG_PREEMPT_RT_BASE */
3631 +static inline void zram_lock_table(struct zram_table_entry *table)
3632 +{
3633 +       spin_lock(&table->lock);
3634 +       __set_bit(ZRAM_ACCESS, &table->value);
3635 +}
3636 +
3637 +static inline void zram_unlock_table(struct zram_table_entry *table)
3638 +{
3639 +       __clear_bit(ZRAM_ACCESS, &table->value);
3640 +       spin_unlock(&table->lock);
3641 +}
3642 +
3643 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3644 +{
3645 +        size_t num_pages = disksize >> PAGE_SHIFT;
3646 +        size_t index;
3647 +
3648 +        for (index = 0; index < num_pages; index++) {
3649 +               spinlock_t *lock = &meta->table[index].lock;
3650 +               spin_lock_init(lock);
3651 +        }
3652 +}
3653 +#endif /* CONFIG_PREEMPT_RT_BASE */
3654 +
3655  #endif
3656 diff --git a/drivers/char/random.c b/drivers/char/random.c
3657 index d6876d506220..0c60b1e54579 100644
3658 --- a/drivers/char/random.c
3659 +++ b/drivers/char/random.c
3660 @@ -1028,8 +1028,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3661         } sample;
3662         long delta, delta2, delta3;
3663  
3664 -       preempt_disable();
3665 -
3666         sample.jiffies = jiffies;
3667         sample.cycles = random_get_entropy();
3668         sample.num = num;
3669 @@ -1070,7 +1068,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3670                  */
3671                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3672         }
3673 -       preempt_enable();
3674  }
3675  
3676  void add_input_randomness(unsigned int type, unsigned int code,
3677 @@ -1123,28 +1120,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3678         return *(ptr + f->reg_idx++);
3679  }
3680  
3681 -void add_interrupt_randomness(int irq, int irq_flags)
3682 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3683  {
3684         struct entropy_store    *r;
3685         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3686 -       struct pt_regs          *regs = get_irq_regs();
3687         unsigned long           now = jiffies;
3688         cycles_t                cycles = random_get_entropy();
3689         __u32                   c_high, j_high;
3690 -       __u64                   ip;
3691         unsigned long           seed;
3692         int                     credit = 0;
3693  
3694         if (cycles == 0)
3695 -               cycles = get_reg(fast_pool, regs);
3696 +               cycles = get_reg(fast_pool, NULL);
3697         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3698         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3699         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3700         fast_pool->pool[1] ^= now ^ c_high;
3701 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3702 +       if (!ip)
3703 +               ip = _RET_IP_;
3704         fast_pool->pool[2] ^= ip;
3705         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3706 -               get_reg(fast_pool, regs);
3707 +               get_reg(fast_pool, NULL);
3708  
3709         fast_mix(fast_pool);
3710         add_interrupt_bench(cycles);
3711 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3712 index 4da2af9694a2..5b6f57f500b8 100644
3713 --- a/drivers/clocksource/tcb_clksrc.c
3714 +++ b/drivers/clocksource/tcb_clksrc.c
3715 @@ -23,8 +23,7 @@
3716   *     this 32 bit free-running counter. the second channel is not used.
3717   *
3718   *   - The third channel may be used to provide a 16-bit clockevent
3719 - *     source, used in either periodic or oneshot mode.  This runs
3720 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3721 + *     source, used in either periodic or oneshot mode.
3722   *
3723   * A boot clocksource and clockevent source are also currently needed,
3724   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3725 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3726  struct tc_clkevt_device {
3727         struct clock_event_device       clkevt;
3728         struct clk                      *clk;
3729 +       bool                            clk_enabled;
3730 +       u32                             freq;
3731         void __iomem                    *regs;
3732  };
3733  
3734 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3735         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3736  }
3737  
3738 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3739 - * because using one of the divided clocks would usually mean the
3740 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3741 - *
3742 - * A divided clock could be good for high resolution timers, since
3743 - * 30.5 usec resolution can seem "low".
3744 - */
3745  static u32 timer_clock;
3746  
3747 +static void tc_clk_disable(struct clock_event_device *d)
3748 +{
3749 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3750 +
3751 +       clk_disable(tcd->clk);
3752 +       tcd->clk_enabled = false;
3753 +}
3754 +
3755 +static void tc_clk_enable(struct clock_event_device *d)
3756 +{
3757 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3758 +
3759 +       if (tcd->clk_enabled)
3760 +               return;
3761 +       clk_enable(tcd->clk);
3762 +       tcd->clk_enabled = true;
3763 +}
3764 +
3765  static int tc_shutdown(struct clock_event_device *d)
3766  {
3767         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3768 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3769  
3770         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3771         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3772 +       return 0;
3773 +}
3774 +
3775 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3776 +{
3777 +       tc_shutdown(d);
3778         if (!clockevent_state_detached(d))
3779 -               clk_disable(tcd->clk);
3780 +               tc_clk_disable(d);
3781  
3782         return 0;
3783  }
3784 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3785         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3786                 tc_shutdown(d);
3787  
3788 -       clk_enable(tcd->clk);
3789 +       tc_clk_enable(d);
3790  
3791 -       /* slow clock, count up to RC, then irq and stop */
3792 +       /* count up to RC, then irq and stop */
3793         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3794                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3795         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3796 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3797         /* By not making the gentime core emulate periodic mode on top
3798          * of oneshot, we get lower overhead and improved accuracy.
3799          */
3800 -       clk_enable(tcd->clk);
3801 +       tc_clk_enable(d);
3802  
3803 -       /* slow clock, count up to RC, then irq and restart */
3804 +       /* count up to RC, then irq and restart */
3805         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3806                      regs + ATMEL_TC_REG(2, CMR));
3807 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3808 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3809  
3810         /* Enable clock and interrupts on RC compare */
3811         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3812 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3813                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3814                                           CLOCK_EVT_FEAT_ONESHOT,
3815                 /* Should be lower than at91rm9200's system timer */
3816 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3817                 .rating                 = 125,
3818 +#else
3819 +               .rating                 = 200,
3820 +#endif
3821                 .set_next_event         = tc_next_event,
3822 -               .set_state_shutdown     = tc_shutdown,
3823 +               .set_state_shutdown     = tc_shutdown_clk_off,
3824                 .set_state_periodic     = tc_set_periodic,
3825                 .set_state_oneshot      = tc_set_oneshot,
3826         },
3827 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3828         return IRQ_NONE;
3829  }
3830  
3831 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3832 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3833  {
3834 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3835         int ret;
3836         struct clk *t2_clk = tc->clk[2];
3837         int irq = tc->irq[2];
3838 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3839         clkevt.regs = tc->regs;
3840         clkevt.clk = t2_clk;
3841  
3842 -       timer_clock = clk32k_divisor_idx;
3843 +       timer_clock = divisor_idx;
3844 +       if (!divisor)
3845 +               clkevt.freq = 32768;
3846 +       else
3847 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3848  
3849         clkevt.clkevt.cpumask = cpumask_of(0);
3850  
3851 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3852                 return ret;
3853         }
3854  
3855 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3856 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3857  
3858         return ret;
3859  }
3860 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
3861                 goto err_disable_t1;
3862  
3863         /* channel 2:  periodic and oneshot timer support */
3864 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3865         ret = setup_clkevents(tc, clk32k_divisor_idx);
3866 +#else
3867 +       ret = setup_clkevents(tc, best_divisor_idx);
3868 +#endif
3869         if (ret)
3870                 goto err_unregister_clksrc;
3871  
3872 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
3873 index 6555821bbdae..93288849b2bd 100644
3874 --- a/drivers/clocksource/timer-atmel-pit.c
3875 +++ b/drivers/clocksource/timer-atmel-pit.c
3876 @@ -46,6 +46,7 @@ struct pit_data {
3877         u32             cycle;
3878         u32             cnt;
3879         unsigned int    irq;
3880 +       bool            irq_requested;
3881         struct clk      *mck;
3882  };
3883  
3884 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
3885  
3886         /* disable irq, leaving the clocksource active */
3887         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
3888 +       if (data->irq_requested) {
3889 +               free_irq(data->irq, data);
3890 +               data->irq_requested = false;
3891 +       }
3892         return 0;
3893  }
3894  
3895 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
3896  /*
3897   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
3898   */
3899  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
3900  {
3901         struct pit_data *data = clkevt_to_pit_data(dev);
3902 +       int ret;
3903 +
3904 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3905 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3906 +                         "at91_tick", data);
3907 +       if (ret)
3908 +               panic(pr_fmt("Unable to setup IRQ\n"));
3909 +
3910 +       data->irq_requested = true;
3911  
3912         /* update clocksource counter */
3913         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
3914 @@ -230,15 +245,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node)
3915                 return ret;
3916         }
3917  
3918 -       /* Set up irq handler */
3919 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
3920 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3921 -                         "at91_tick", data);
3922 -       if (ret) {
3923 -               pr_err("Unable to setup IRQ\n");
3924 -               return ret;
3925 -       }
3926 -
3927         /* Set up and register clockevents */
3928         data->clkevt.name = "pit";
3929         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
3930 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
3931 index e90ab5b63a90..9e124087c55f 100644
3932 --- a/drivers/clocksource/timer-atmel-st.c
3933 +++ b/drivers/clocksource/timer-atmel-st.c
3934 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
3935         last_crtr = read_CRTR();
3936  }
3937  
3938 +static int atmel_st_irq;
3939 +
3940  static int clkevt32k_shutdown(struct clock_event_device *evt)
3941  {
3942         clkdev32k_disable_and_flush_irq();
3943         irqmask = 0;
3944         regmap_write(regmap_st, AT91_ST_IER, irqmask);
3945 +       free_irq(atmel_st_irq, regmap_st);
3946         return 0;
3947  }
3948  
3949  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3950  {
3951 +       int ret;
3952 +
3953         clkdev32k_disable_and_flush_irq();
3954  
3955 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3956 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3957 +                         "at91_tick", regmap_st);
3958 +       if (ret)
3959 +               panic(pr_fmt("Unable to setup IRQ\n"));
3960 +
3961         /*
3962          * ALM for oneshot irqs, set by next_event()
3963          * before 32 seconds have passed.
3964 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
3965  
3966  static int clkevt32k_set_periodic(struct clock_event_device *dev)
3967  {
3968 +       int ret;
3969 +
3970         clkdev32k_disable_and_flush_irq();
3971  
3972 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
3973 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
3974 +                         "at91_tick", regmap_st);
3975 +       if (ret)
3976 +               panic(pr_fmt("Unable to setup IRQ\n"));
3977 +
3978         /* PIT for periodic irqs; fixed rate of 1/HZ */
3979         irqmask = AT91_ST_PITS;
3980         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
3981 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
3982  {
3983         struct clk *sclk;
3984         unsigned int sclk_rate, val;
3985 -       int irq, ret;
3986 +       int ret;
3987  
3988         regmap_st = syscon_node_to_regmap(node);
3989         if (IS_ERR(regmap_st)) {
3990 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
3991         regmap_read(regmap_st, AT91_ST_SR, &val);
3992  
3993         /* Get the interrupts property */
3994 -       irq  = irq_of_parse_and_map(node, 0);
3995 -       if (!irq) {
3996 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
3997 +       if (!atmel_st_irq) {
3998                 pr_err("Unable to get IRQ from DT\n");
3999                 return -EINVAL;
4000         }
4001  
4002 -       /* Make IRQs happen for the system timer */
4003 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
4004 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4005 -                         "at91_tick", regmap_st);
4006 -       if (ret) {
4007 -               pr_err("Unable to setup IRQ\n");
4008 -               return ret;
4009 -       }
4010 -
4011         sclk = of_clk_get(node, 0);
4012         if (IS_ERR(sclk)) {
4013                 pr_err("Unable to get slow clock\n");
4014 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
4015 index a782ce87715c..19d265948526 100644
4016 --- a/drivers/connector/cn_proc.c
4017 +++ b/drivers/connector/cn_proc.c
4018 @@ -32,6 +32,7 @@
4019  #include <linux/pid_namespace.h>
4020  
4021  #include <linux/cn_proc.h>
4022 +#include <linux/locallock.h>
4023  
4024  /*
4025   * Size of a cn_msg followed by a proc_event structure.  Since the
4026 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4027  
4028  /* proc_event_counts is used as the sequence number of the netlink message */
4029  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4030 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4031  
4032  static inline void send_msg(struct cn_msg *msg)
4033  {
4034 -       preempt_disable();
4035 +       local_lock(send_msg_lock);
4036  
4037         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4038         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4039 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4040          */
4041         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4042  
4043 -       preempt_enable();
4044 +       local_unlock(send_msg_lock);
4045  }
4046  
4047  void proc_fork_connector(struct task_struct *task)
4048 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4049 index adbd1de1cea5..1fac5074f2cf 100644
4050 --- a/drivers/cpufreq/Kconfig.x86
4051 +++ b/drivers/cpufreq/Kconfig.x86
4052 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4053  
4054  config X86_POWERNOW_K8
4055         tristate "AMD Opteron/Athlon64 PowerNow!"
4056 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4057 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4058         help
4059           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4060           Support for K10 and newer processors is now in acpi-cpufreq.
4061 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4062 index a218c2e395e7..5273d8f1d5dd 100644
4063 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4064 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4065 @@ -1537,7 +1537,9 @@ execbuf_submit(struct i915_execbuffer_params *params,
4066         if (ret)
4067                 return ret;
4068  
4069 +#ifndef CONFIG_PREEMPT_RT_BASE
4070         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4071 +#endif
4072  
4073         i915_gem_execbuffer_move_to_active(vmas, params->request);
4074  
4075 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4076 index 1c237d02f30b..9e9b4404c0d7 100644
4077 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4078 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4079 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4080         if (!mutex_is_locked(mutex))
4081                 return false;
4082  
4083 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4084 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4085         return mutex->owner == task;
4086  #else
4087         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4088 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4089 index 3fc286cd1157..252a1117b103 100644
4090 --- a/drivers/gpu/drm/i915/i915_irq.c
4091 +++ b/drivers/gpu/drm/i915/i915_irq.c
4092 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4093         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4094  
4095         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4096 +       preempt_disable_rt();
4097  
4098         /* Get optional system timestamp before query. */
4099         if (stime)
4100 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4101                 *etime = ktime_get();
4102  
4103         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4104 +       preempt_enable_rt();
4105  
4106         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4107  
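The i915_irq hunk above only activates the preempt_disable_rt()/preempt_enable_rt() calls that the mainline comments already point at. Roughly, these helpers compile away on non-RT kernels and map to real preemption control on RT; a sketch of the expected mapping (the authoritative definitions live in the core part of the series):

        /* Sketch of the RT-conditional preemption helpers. */
        #ifdef CONFIG_PREEMPT_RT_BASE
        # define preempt_disable_rt()   preempt_disable()
        # define preempt_enable_rt()    preempt_enable()
        #else
        # define preempt_disable_rt()   barrier()
        # define preempt_enable_rt()    barrier()
        #endif

This keeps the scanout-position sampling free of preemption-induced jitter on RT without adding any cost to non-RT builds.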
4108 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4109 index 869b29fe9ec4..c8b8788d9d36 100644
4110 --- a/drivers/gpu/drm/i915/intel_display.c
4111 +++ b/drivers/gpu/drm/i915/intel_display.c
4112 @@ -12131,7 +12131,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4113         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4114         struct intel_flip_work *work;
4115  
4116 -       WARN_ON(!in_interrupt());
4117 +       WARN_ON_NONRT(!in_interrupt());
4118  
4119         if (crtc == NULL)
4120                 return;
4121 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4122 index dbed12c484c9..5c540b78e8b5 100644
4123 --- a/drivers/gpu/drm/i915/intel_sprite.c
4124 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4125 @@ -35,6 +35,7 @@
4126  #include <drm/drm_rect.h>
4127  #include <drm/drm_atomic.h>
4128  #include <drm/drm_plane_helper.h>
4129 +#include <linux/locallock.h>
4130  #include "intel_drv.h"
4131  #include "intel_frontbuffer.h"
4132  #include <drm/i915_drm.h>
4133 @@ -65,6 +66,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4134                             1000 * adjusted_mode->crtc_htotal);
4135  }
4136  
4137 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4138 +
4139  /**
4140   * intel_pipe_update_start() - start update of a set of display registers
4141   * @crtc: the crtc of which the registers are going to be updated
4142 @@ -95,7 +98,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4143         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4144         max = vblank_start - 1;
4145  
4146 -       local_irq_disable();
4147 +       local_lock_irq(pipe_update_lock);
4148  
4149         if (min <= 0 || max <= 0)
4150                 return;
4151 @@ -125,11 +128,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4152                         break;
4153                 }
4154  
4155 -               local_irq_enable();
4156 +               local_unlock_irq(pipe_update_lock);
4157  
4158                 timeout = schedule_timeout(timeout);
4159  
4160 -               local_irq_disable();
4161 +               local_lock_irq(pipe_update_lock);
4162         }
4163  
4164         finish_wait(wq, &wait);
4165 @@ -181,7 +184,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4166                 crtc->base.state->event = NULL;
4167         }
4168  
4169 -       local_irq_enable();
4170 +       local_unlock_irq(pipe_update_lock);
4171  
4172         if (crtc->debug.start_vbl_count &&
4173             crtc->debug.start_vbl_count != end_vbl_count) {
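In intel_sprite.c the raw local_irq_disable()/local_irq_enable() pair around the vblank-evasion loop becomes a named local IRQ lock, as shown in the hunk. A rough usage sketch of the same primitive (again assuming the locallock.h API from this series):

        #include <linux/locallock.h>

        static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);

        static void program_pipe_registers(void)
        {
                /* !RT: disables interrupts, exactly as before.
                 * RT: takes a per-CPU rtmutex-backed lock instead, so a
                 * higher-priority task can still preempt this section. */
                local_lock_irq(pipe_update_lock);
                /* ... write the display registers back-to-back ... */
                local_unlock_irq(pipe_update_lock);
        }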
4174 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4175 index 192b2d3a79cb..d5372a207326 100644
4176 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4177 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4178 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4179         if (!mutex_is_locked(mutex))
4180                 return false;
4181  
4182 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4183 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4184         return mutex->owner == task;
4185  #else
4186         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4187 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4188 index cdb8cb568c15..b6d7fd964cbc 100644
4189 --- a/drivers/gpu/drm/radeon/radeon_display.c
4190 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4191 @@ -1845,6 +1845,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4192         struct radeon_device *rdev = dev->dev_private;
4193  
4194         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4195 +       preempt_disable_rt();
4196  
4197         /* Get optional system timestamp before query. */
4198         if (stime)
4199 @@ -1937,6 +1938,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4200                 *etime = ktime_get();
4201  
4202         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4203 +       preempt_enable_rt();
4204  
4205         /* Decode into vertical and horizontal scanout position. */
4206         *vpos = position & 0x1fff;
4207 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4208 index 0276d2ef06ee..8868045eabde 100644
4209 --- a/drivers/hv/vmbus_drv.c
4210 +++ b/drivers/hv/vmbus_drv.c
4211 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4212         void *page_addr;
4213         struct hv_message *msg;
4214         union hv_synic_event_flags *event;
4215 +       struct pt_regs *regs = get_irq_regs();
4216 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4217         bool handled = false;
4218  
4219         page_addr = hv_context.synic_event_page[cpu];
4220 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4221                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4222         }
4223  
4224 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4225 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4226  }
4227  
4228  
4229 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4230 index 36f76e28a0bf..394f142f90c7 100644
4231 --- a/drivers/ide/alim15x3.c
4232 +++ b/drivers/ide/alim15x3.c
4233 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4234  
4235         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4236  
4237 -       local_irq_save(flags);
4238 +       local_irq_save_nort(flags);
4239  
4240         if (m5229_revision < 0xC2) {
4241                 /*
4242 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4243         }
4244         pci_dev_put(north);
4245         pci_dev_put(isa_dev);
4246 -       local_irq_restore(flags);
4247 +       local_irq_restore_nort(flags);
4248         return 0;
4249  }
4250  
4251 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4252 index 0ceae5cbd89a..c212e85d7f3e 100644
4253 --- a/drivers/ide/hpt366.c
4254 +++ b/drivers/ide/hpt366.c
4255 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4256  
4257         dma_old = inb(base + 2);
4258  
4259 -       local_irq_save(flags);
4260 +       local_irq_save_nort(flags);
4261  
4262         dma_new = dma_old;
4263         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4264 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4265         if (dma_new != dma_old)
4266                 outb(dma_new, base + 2);
4267  
4268 -       local_irq_restore(flags);
4269 +       local_irq_restore_nort(flags);
4270  
4271         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4272                          hwif->name, base, base + 7);
4273 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4274 index 19763977568c..4169433faab5 100644
4275 --- a/drivers/ide/ide-io-std.c
4276 +++ b/drivers/ide/ide-io-std.c
4277 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4278                 unsigned long uninitialized_var(flags);
4279  
4280                 if ((io_32bit & 2) && !mmio) {
4281 -                       local_irq_save(flags);
4282 +                       local_irq_save_nort(flags);
4283                         ata_vlb_sync(io_ports->nsect_addr);
4284                 }
4285  
4286 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4287                         insl(data_addr, buf, words);
4288  
4289                 if ((io_32bit & 2) && !mmio)
4290 -                       local_irq_restore(flags);
4291 +                       local_irq_restore_nort(flags);
4292  
4293                 if (((len + 1) & 3) < 2)
4294                         return;
4295 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4296                 unsigned long uninitialized_var(flags);
4297  
4298                 if ((io_32bit & 2) && !mmio) {
4299 -                       local_irq_save(flags);
4300 +                       local_irq_save_nort(flags);
4301                         ata_vlb_sync(io_ports->nsect_addr);
4302                 }
4303  
4304 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4305                         outsl(data_addr, buf, words);
4306  
4307                 if ((io_32bit & 2) && !mmio)
4308 -                       local_irq_restore(flags);
4309 +                       local_irq_restore_nort(flags);
4310  
4311                 if (((len + 1) & 3) < 2)
4312                         return;
4313 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4314 index 669ea1e45795..e12e43e62245 100644
4315 --- a/drivers/ide/ide-io.c
4316 +++ b/drivers/ide/ide-io.c
4317 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4318                 /* disable_irq_nosync ?? */
4319                 disable_irq(hwif->irq);
4320                 /* local CPU only, as if we were handling an interrupt */
4321 -               local_irq_disable();
4322 +               local_irq_disable_nort();
4323                 if (hwif->polling) {
4324                         startstop = handler(drive);
4325                 } else if (drive_is_ready(drive)) {
4326 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4327 index 376f2dc410c5..f014dd1b73dc 100644
4328 --- a/drivers/ide/ide-iops.c
4329 +++ b/drivers/ide/ide-iops.c
4330 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4331                                 if ((stat & ATA_BUSY) == 0)
4332                                         break;
4333  
4334 -                               local_irq_restore(flags);
4335 +                               local_irq_restore_nort(flags);
4336                                 *rstat = stat;
4337                                 return -EBUSY;
4338                         }
4339                 }
4340 -               local_irq_restore(flags);
4341 +               local_irq_restore_nort(flags);
4342         }
4343         /*
4344          * Allow status to settle, then read it again.
4345 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4346 index 0b63facd1d87..4ceba37afc0c 100644
4347 --- a/drivers/ide/ide-probe.c
4348 +++ b/drivers/ide/ide-probe.c
4349 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4350         int bswap = 1;
4351  
4352         /* local CPU only; some systems need this */
4353 -       local_irq_save(flags);
4354 +       local_irq_save_nort(flags);
4355         /* read 512 bytes of id info */
4356         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4357 -       local_irq_restore(flags);
4358 +       local_irq_restore_nort(flags);
4359  
4360         drive->dev_flags |= IDE_DFLAG_ID_READ;
4361  #ifdef DEBUG
4362 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4363 index a716693417a3..be0568c722d6 100644
4364 --- a/drivers/ide/ide-taskfile.c
4365 +++ b/drivers/ide/ide-taskfile.c
4366 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4367  
4368                 page_is_high = PageHighMem(page);
4369                 if (page_is_high)
4370 -                       local_irq_save(flags);
4371 +                       local_irq_save_nort(flags);
4372  
4373                 buf = kmap_atomic(page) + offset;
4374  
4375 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4376                 kunmap_atomic(buf);
4377  
4378                 if (page_is_high)
4379 -                       local_irq_restore(flags);
4380 +                       local_irq_restore_nort(flags);
4381  
4382                 len -= nr_bytes;
4383         }
4384 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4385         }
4386  
4387         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4388 -               local_irq_disable();
4389 +               local_irq_disable_nort();
4390  
4391         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4392  
4393 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4394 index fddff403d5d2..cca1bb4fbfe3 100644
4395 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4396 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4397 @@ -902,7 +902,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4398  
4399         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4400  
4401 -       local_irq_save(flags);
4402 +       local_irq_save_nort(flags);
4403         netif_addr_lock(dev);
4404         spin_lock(&priv->lock);
4405  
4406 @@ -984,7 +984,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4407  
4408         spin_unlock(&priv->lock);
4409         netif_addr_unlock(dev);
4410 -       local_irq_restore(flags);
4411 +       local_irq_restore_nort(flags);
4412  
4413         /*
4414          * make sure the in-flight joins have finished before we attempt
4415 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4416 index 4a2a9e370be7..e970d9afd179 100644
4417 --- a/drivers/input/gameport/gameport.c
4418 +++ b/drivers/input/gameport/gameport.c
4419 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4420         tx = ~0;
4421  
4422         for (i = 0; i < 50; i++) {
4423 -               local_irq_save(flags);
4424 +               local_irq_save_nort(flags);
4425                 t1 = ktime_get_ns();
4426                 for (t = 0; t < 50; t++)
4427                         gameport_read(gameport);
4428                 t2 = ktime_get_ns();
4429                 t3 = ktime_get_ns();
4430 -               local_irq_restore(flags);
4431 +               local_irq_restore_nort(flags);
4432                 udelay(i * 10);
4433                 t = (t2 - t1) - (t3 - t2);
4434                 if (t < tx)
4435 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4436         tx = 1 << 30;
4437  
4438         for(i = 0; i < 50; i++) {
4439 -               local_irq_save(flags);
4440 +               local_irq_save_nort(flags);
4441                 GET_TIME(t1);
4442                 for (t = 0; t < 50; t++) gameport_read(gameport);
4443                 GET_TIME(t2);
4444                 GET_TIME(t3);
4445 -               local_irq_restore(flags);
4446 +               local_irq_restore_nort(flags);
4447                 udelay(i * 10);
4448                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4449         }
4450 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4451         tx = 1 << 30;
4452  
4453         for(i = 0; i < 50; i++) {
4454 -               local_irq_save(flags);
4455 +               local_irq_save_nort(flags);
4456                 t1 = rdtsc();
4457                 for (t = 0; t < 50; t++) gameport_read(gameport);
4458                 t2 = rdtsc();
4459 -               local_irq_restore(flags);
4460 +               local_irq_restore_nort(flags);
4461                 udelay(i * 10);
4462                 if (t2 - t1 < tx) tx = t2 - t1;
4463         }
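The IDE and gameport changes above (and several driver hunks below) switch long polling sections to the *_nort ("not on RT") IRQ helpers. Their expected behaviour, sketched here from the usage rather than quoted from the series: on non-RT kernels they are the ordinary primitives, on RT they degrade to (nearly) no-ops so these slow loops cannot keep interrupts off.

        /* Sketch of the _nort IRQ helpers. */
        #ifdef CONFIG_PREEMPT_RT_FULL
        # define local_irq_disable_nort()       do { } while (0)
        # define local_irq_enable_nort()        do { } while (0)
        # define local_irq_save_nort(flags)     local_save_flags(flags)
        # define local_irq_restore_nort(flags)  (void)(flags)
        #else
        # define local_irq_disable_nort()       local_irq_disable()
        # define local_irq_enable_nort()        local_irq_enable()
        # define local_irq_save_nort(flags)     local_irq_save(flags)
        # define local_irq_restore_nort(flags)  local_irq_restore(flags)
        #endif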
4464 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4465 index 11a13b5be73a..baaed0ac274b 100644
4466 --- a/drivers/iommu/amd_iommu.c
4467 +++ b/drivers/iommu/amd_iommu.c
4468 @@ -1923,10 +1923,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4469         int ret;
4470  
4471         /*
4472 -        * Must be called with IRQs disabled. Warn here to detect early
4473 -        * when its not.
4474 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4475 +        * detect early when it's not.
4476          */
4477 -       WARN_ON(!irqs_disabled());
4478 +       WARN_ON_NONRT(!irqs_disabled());
4479  
4480         /* lock domain */
4481         spin_lock(&domain->lock);
4482 @@ -2094,10 +2094,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4483         struct protection_domain *domain;
4484  
4485         /*
4486 -        * Must be called with IRQs disabled. Warn here to detect early
4487 -        * when its not.
4488 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4489 +        * detect early when it's not.
4490          */
4491 -       WARN_ON(!irqs_disabled());
4492 +       WARN_ON_NONRT(!irqs_disabled());
4493  
4494         if (WARN_ON(!dev_data->domain))
4495                 return;
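As in intel_display.c and dm-rq.c, the amd_iommu hunks only relax debugging assertions: on RT these paths legitimately run with interrupts enabled, so the checks must not fire there. A plausible sketch of the RT-aware assertion macros (assumed, not quoted from the series):

        #ifdef CONFIG_PREEMPT_RT_BASE
        # define WARN_ON_NONRT(cond)    do { } while (0)
        # define BUG_ON_NONRT(cond)     do { } while (0)
        #else
        # define WARN_ON_NONRT(cond)    WARN_ON(cond)
        # define BUG_ON_NONRT(cond)     BUG_ON(cond)
        #endif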
4496 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4497 index d82637ab09fd..ebe41d30c093 100644
4498 --- a/drivers/iommu/intel-iommu.c
4499 +++ b/drivers/iommu/intel-iommu.c
4500 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4501         struct deferred_flush_table *tables;
4502  };
4503  
4504 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4505 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4506  
4507  /* bitmap for indexing intel_iommus */
4508  static int g_num_of_iommus;
4509 @@ -3715,10 +3715,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4510         struct intel_iommu *iommu;
4511         struct deferred_flush_entry *entry;
4512         struct deferred_flush_data *flush_data;
4513 -       unsigned int cpuid;
4514  
4515 -       cpuid = get_cpu();
4516 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4517 +       flush_data = raw_cpu_ptr(&deferred_flush);
4518  
4519         /* Flush all CPUs' entries to avoid deferring too much.  If
4520          * this becomes a bottleneck, can just flush us, and rely on
4521 @@ -3751,8 +3749,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4522         }
4523         flush_data->size++;
4524         spin_unlock_irqrestore(&flush_data->lock, flags);
4525 -
4526 -       put_cpu();
4527  }
4528  
4529  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4530 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4531 index e23001bfcfee..359d5d169ec0 100644
4532 --- a/drivers/iommu/iova.c
4533 +++ b/drivers/iommu/iova.c
4534 @@ -22,6 +22,7 @@
4535  #include <linux/slab.h>
4536  #include <linux/smp.h>
4537  #include <linux/bitops.h>
4538 +#include <linux/cpu.h>
4539  
4540  static bool iova_rcache_insert(struct iova_domain *iovad,
4541                                unsigned long pfn,
4542 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4543  
4544                 /* Try replenishing IOVAs by flushing rcache. */
4545                 flushed_rcache = true;
4546 -               preempt_disable();
4547                 for_each_online_cpu(cpu)
4548                         free_cpu_cached_iovas(cpu, iovad);
4549 -               preempt_enable();
4550                 goto retry;
4551         }
4552  
4553 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4554         bool can_insert = false;
4555         unsigned long flags;
4556  
4557 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4558 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4559         spin_lock_irqsave(&cpu_rcache->lock, flags);
4560  
4561         if (!iova_magazine_full(cpu_rcache->loaded)) {
4562 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4563                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4564  
4565         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4566 -       put_cpu_ptr(rcache->cpu_rcaches);
4567  
4568         if (mag_to_free) {
4569                 iova_magazine_free_pfns(mag_to_free, iovad);
4570 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4571         bool has_pfn = false;
4572         unsigned long flags;
4573  
4574 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4575 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4576         spin_lock_irqsave(&cpu_rcache->lock, flags);
4577  
4578         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4579 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4580                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4581  
4582         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4583 -       put_cpu_ptr(rcache->cpu_rcaches);
4584  
4585         return iova_pfn;
4586  }
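In the iova cache above, get_cpu_ptr()/put_cpu_ptr() (which disable preemption) become plain raw_cpu_ptr() accesses: the per-CPU structure is protected by its own spin_lock_irqsave() immediately afterwards, so pinning the task to the CPU buys nothing and only hurts RT latency. The pattern, with illustrative names only:

        #include <linux/percpu.h>
        #include <linux/spinlock.h>

        struct pcpu_cache {
                spinlock_t lock;        /* spin_lock_init()'d during setup */
                unsigned long count;
        };
        static DEFINE_PER_CPU(struct pcpu_cache, my_cache);

        static void cache_add(void)
        {
                struct pcpu_cache *c;
                unsigned long flags;

                /* The lock, not preempt-disable, provides the exclusion.
                 * Migrating after the lookup merely means touching another
                 * CPU's cache, which is still correct. */
                c = raw_cpu_ptr(&my_cache);
                spin_lock_irqsave(&c->lock, flags);
                c->count++;
                spin_unlock_irqrestore(&c->lock, flags);
        }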
4587 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4588 index 3f9ddb9fafa7..09da5b6b44a1 100644
4589 --- a/drivers/leds/trigger/Kconfig
4590 +++ b/drivers/leds/trigger/Kconfig
4591 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4592  
4593  config LEDS_TRIGGER_CPU
4594         bool "LED CPU Trigger"
4595 -       depends on LEDS_TRIGGERS
4596 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4597         help
4598           This allows LEDs to be controlled by active CPUs. This shows
4599           the active CPUs across an array of LEDs so you can see which
4600 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4601 index 4d200883c505..98b64ed5cb81 100644
4602 --- a/drivers/md/bcache/Kconfig
4603 +++ b/drivers/md/bcache/Kconfig
4604 @@ -1,6 +1,7 @@
4605  
4606  config BCACHE
4607         tristate "Block device as cache"
4608 +       depends on !PREEMPT_RT_FULL
4609         ---help---
4610         Allows a block device to be used as cache for other devices; uses
4611         a btree for indexing and the layout is optimized for SSDs.
4612 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4613 index 31a89c8832c0..c3a7e8a9f761 100644
4614 --- a/drivers/md/dm-rq.c
4615 +++ b/drivers/md/dm-rq.c
4616 @@ -838,7 +838,7 @@ static void dm_old_request_fn(struct request_queue *q)
4617                 /* Establish tio->ti before queuing work (map_tio_request) */
4618                 tio->ti = ti;
4619                 kthread_queue_work(&md->kworker, &tio->work);
4620 -               BUG_ON(!irqs_disabled());
4621 +               BUG_ON_NONRT(!irqs_disabled());
4622         }
4623  }
4624  
4625 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4626 index cce6057b9aca..fa2c4de32a64 100644
4627 --- a/drivers/md/raid5.c
4628 +++ b/drivers/md/raid5.c
4629 @@ -1928,8 +1928,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4630         struct raid5_percpu *percpu;
4631         unsigned long cpu;
4632  
4633 -       cpu = get_cpu();
4634 +       cpu = get_cpu_light();
4635         percpu = per_cpu_ptr(conf->percpu, cpu);
4636 +       spin_lock(&percpu->lock);
4637         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4638                 ops_run_biofill(sh);
4639                 overlap_clear++;
4640 @@ -1985,7 +1986,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4641                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4642                                 wake_up(&sh->raid_conf->wait_for_overlap);
4643                 }
4644 -       put_cpu();
4645 +       spin_unlock(&percpu->lock);
4646 +       put_cpu_light();
4647  }
4648  
4649  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4650 @@ -6391,6 +6393,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
4651                        __func__, cpu);
4652                 return -ENOMEM;
4653         }
4654 +       spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4655         return 0;
4656  }
4657  
4658 @@ -6401,7 +6404,6 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4659         conf->percpu = alloc_percpu(struct raid5_percpu);
4660         if (!conf->percpu)
4661                 return -ENOMEM;
4662 -
4663         err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
4664         if (!err) {
4665                 conf->scribble_disks = max(conf->raid_disks,
4666 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4667 index 57ec49f0839e..0739604990b7 100644
4668 --- a/drivers/md/raid5.h
4669 +++ b/drivers/md/raid5.h
4670 @@ -504,6 +504,7 @@ struct r5conf {
4671         int                     recovery_disabled;
4672         /* per cpu variables */
4673         struct raid5_percpu {
4674 +               spinlock_t      lock;           /* Protection for -RT */
4675                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4676                 struct flex_array *scribble;   /* space for constructing buffer
4677                                               * lists and performing address
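raid5 (like the fcoe and libfc hunks further down) replaces get_cpu()/put_cpu() with get_cpu_light()/put_cpu_light() and adds an explicit per-CPU spinlock: the task is only pinned to its CPU (via migrate_disable() on RT) while the new lock supplies the actual mutual exclusion. A sketch of how the light helpers are expected to map (assumed, simplified):

        #ifdef CONFIG_PREEMPT_RT_FULL
        # define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
        # define put_cpu_light()        migrate_enable()
        #else
        # define get_cpu_light()        get_cpu()
        # define put_cpu_light()        put_cpu()
        #endif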
4678 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4679 index 64971baf11fa..215e91e36198 100644
4680 --- a/drivers/misc/Kconfig
4681 +++ b/drivers/misc/Kconfig
4682 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4683  config ATMEL_TCLIB
4684         bool "Atmel AT32/AT91 Timer/Counter Library"
4685         depends on (AVR32 || ARCH_AT91)
4686 +       default y if PREEMPT_RT_FULL
4687         help
4688           Select this if you want a library to allocate the Timer/Counter
4689           blocks found on many Atmel processors.  This facilitates using
4690 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4691           are combined to make a single 32-bit timer.
4692  
4693           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4694 -         may be used as a clock event device supporting oneshot mode
4695 -         (delays of up to two seconds) based on the 32 KiHz clock.
4696 +         may be used as a clock event device supporting oneshot mode.
4697  
4698  config ATMEL_TCB_CLKSRC_BLOCK
4699         int
4700 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4701           TC can be used for other purposes, such as PWM generation and
4702           interval timing.
4703  
4704 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4705 +       bool "TC Block use 32 KiHz clock"
4706 +       depends on ATMEL_TCB_CLKSRC
4707 +       default y if !PREEMPT_RT_FULL
4708 +       help
4709 +         Select this to use the 32 KiHz base clock rate as the TC block
4710 +         clock source for clock events.
4711 +
4712 +
4713  config DUMMY_IRQ
4714         tristate "Dummy IRQ handler"
4715         default n
4716 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
4717 index df990bb8c873..1a162709a85e 100644
4718 --- a/drivers/mmc/host/mmci.c
4719 +++ b/drivers/mmc/host/mmci.c
4720 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4721         struct sg_mapping_iter *sg_miter = &host->sg_miter;
4722         struct variant_data *variant = host->variant;
4723         void __iomem *base = host->base;
4724 -       unsigned long flags;
4725         u32 status;
4726  
4727         status = readl(base + MMCISTATUS);
4728  
4729         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
4730  
4731 -       local_irq_save(flags);
4732 -
4733         do {
4734                 unsigned int remain, len;
4735                 char *buffer;
4736 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
4737  
4738         sg_miter_stop(sg_miter);
4739  
4740 -       local_irq_restore(flags);
4741 -
4742         /*
4743          * If we have less than the fifo 'half-full' threshold to transfer,
4744          * trigger a PIO interrupt as soon as any data is available.
4745 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
4746 index 9133e7926da5..63afb921ed40 100644
4747 --- a/drivers/net/ethernet/3com/3c59x.c
4748 +++ b/drivers/net/ethernet/3com/3c59x.c
4749 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
4750  {
4751         struct vortex_private *vp = netdev_priv(dev);
4752         unsigned long flags;
4753 -       local_irq_save(flags);
4754 +       local_irq_save_nort(flags);
4755         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
4756 -       local_irq_restore(flags);
4757 +       local_irq_restore_nort(flags);
4758  }
4759  #endif
4760  
4761 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
4762                          * Block interrupts because vortex_interrupt does a bare spin_lock()
4763                          */
4764                         unsigned long flags;
4765 -                       local_irq_save(flags);
4766 +                       local_irq_save_nort(flags);
4767                         if (vp->full_bus_master_tx)
4768                                 boomerang_interrupt(dev->irq, dev);
4769                         else
4770                                 vortex_interrupt(dev->irq, dev);
4771 -                       local_irq_restore(flags);
4772 +                       local_irq_restore_nort(flags);
4773                 }
4774         }
4775  
4776 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
4777 index da4c2d8a4173..1420dfb56bac 100644
4778 --- a/drivers/net/ethernet/realtek/8139too.c
4779 +++ b/drivers/net/ethernet/realtek/8139too.c
4780 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
4781         struct rtl8139_private *tp = netdev_priv(dev);
4782         const int irq = tp->pci_dev->irq;
4783  
4784 -       disable_irq(irq);
4785 +       disable_irq_nosync(irq);
4786         rtl8139_interrupt(irq, dev);
4787         enable_irq(irq);
4788  }
4789 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4790 index bca6935a94db..d7a35ee34d03 100644
4791 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4792 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
4793 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
4794                         while (!ctx->done.done && msecs--)
4795                                 udelay(1000);
4796                 } else {
4797 -                       wait_event_interruptible(ctx->done.wait,
4798 +                       swait_event_interruptible(ctx->done.wait,
4799                                                  ctx->done.done);
4800                 }
4801                 break;
4802 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
4803 index d11cdbb8fba3..223bbb9acb03 100644
4804 --- a/drivers/pci/access.c
4805 +++ b/drivers/pci/access.c
4806 @@ -672,7 +672,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
4807         WARN_ON(!dev->block_cfg_access);
4808  
4809         dev->block_cfg_access = 0;
4810 -       wake_up_all(&pci_cfg_wait);
4811 +       wake_up_all_locked(&pci_cfg_wait);
4812         raw_spin_unlock_irqrestore(&pci_lock, flags);
4813  }
4814  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
4815 diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
4816 index 775c88303017..f8e9e1c2b2f6 100644
4817 --- a/drivers/pinctrl/qcom/pinctrl-msm.c
4818 +++ b/drivers/pinctrl/qcom/pinctrl-msm.c
4819 @@ -61,7 +61,7 @@ struct msm_pinctrl {
4820         struct notifier_block restart_nb;
4821         int irq;
4822  
4823 -       spinlock_t lock;
4824 +       raw_spinlock_t lock;
4825  
4826         DECLARE_BITMAP(dual_edge_irqs, MAX_NR_GPIO);
4827         DECLARE_BITMAP(enabled_irqs, MAX_NR_GPIO);
4828 @@ -153,14 +153,14 @@ static int msm_pinmux_set_mux(struct pinctrl_dev *pctldev,
4829         if (WARN_ON(i == g->nfuncs))
4830                 return -EINVAL;
4831  
4832 -       spin_lock_irqsave(&pctrl->lock, flags);
4833 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4834  
4835         val = readl(pctrl->regs + g->ctl_reg);
4836         val &= ~mask;
4837         val |= i << g->mux_bit;
4838         writel(val, pctrl->regs + g->ctl_reg);
4839  
4840 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4841 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4842  
4843         return 0;
4844  }
4845 @@ -323,14 +323,14 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
4846                         break;
4847                 case PIN_CONFIG_OUTPUT:
4848                         /* set output value */
4849 -                       spin_lock_irqsave(&pctrl->lock, flags);
4850 +                       raw_spin_lock_irqsave(&pctrl->lock, flags);
4851                         val = readl(pctrl->regs + g->io_reg);
4852                         if (arg)
4853                                 val |= BIT(g->out_bit);
4854                         else
4855                                 val &= ~BIT(g->out_bit);
4856                         writel(val, pctrl->regs + g->io_reg);
4857 -                       spin_unlock_irqrestore(&pctrl->lock, flags);
4858 +                       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4859  
4860                         /* enable output */
4861                         arg = 1;
4862 @@ -351,12 +351,12 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev,
4863                         return -EINVAL;
4864                 }
4865  
4866 -               spin_lock_irqsave(&pctrl->lock, flags);
4867 +               raw_spin_lock_irqsave(&pctrl->lock, flags);
4868                 val = readl(pctrl->regs + g->ctl_reg);
4869                 val &= ~(mask << bit);
4870                 val |= arg << bit;
4871                 writel(val, pctrl->regs + g->ctl_reg);
4872 -               spin_unlock_irqrestore(&pctrl->lock, flags);
4873 +               raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4874         }
4875  
4876         return 0;
4877 @@ -384,13 +384,13 @@ static int msm_gpio_direction_input(struct gpio_chip *chip, unsigned offset)
4878  
4879         g = &pctrl->soc->groups[offset];
4880  
4881 -       spin_lock_irqsave(&pctrl->lock, flags);
4882 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4883  
4884         val = readl(pctrl->regs + g->ctl_reg);
4885         val &= ~BIT(g->oe_bit);
4886         writel(val, pctrl->regs + g->ctl_reg);
4887  
4888 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4889 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4890  
4891         return 0;
4892  }
4893 @@ -404,7 +404,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
4894  
4895         g = &pctrl->soc->groups[offset];
4896  
4897 -       spin_lock_irqsave(&pctrl->lock, flags);
4898 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4899  
4900         val = readl(pctrl->regs + g->io_reg);
4901         if (value)
4902 @@ -417,7 +417,7 @@ static int msm_gpio_direction_output(struct gpio_chip *chip, unsigned offset, in
4903         val |= BIT(g->oe_bit);
4904         writel(val, pctrl->regs + g->ctl_reg);
4905  
4906 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4907 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4908  
4909         return 0;
4910  }
4911 @@ -443,7 +443,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
4912  
4913         g = &pctrl->soc->groups[offset];
4914  
4915 -       spin_lock_irqsave(&pctrl->lock, flags);
4916 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4917  
4918         val = readl(pctrl->regs + g->io_reg);
4919         if (value)
4920 @@ -452,7 +452,7 @@ static void msm_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
4921                 val &= ~BIT(g->out_bit);
4922         writel(val, pctrl->regs + g->io_reg);
4923  
4924 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4925 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4926  }
4927  
4928  #ifdef CONFIG_DEBUG_FS
4929 @@ -571,7 +571,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
4930  
4931         g = &pctrl->soc->groups[d->hwirq];
4932  
4933 -       spin_lock_irqsave(&pctrl->lock, flags);
4934 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4935  
4936         val = readl(pctrl->regs + g->intr_cfg_reg);
4937         val &= ~BIT(g->intr_enable_bit);
4938 @@ -579,7 +579,7 @@ static void msm_gpio_irq_mask(struct irq_data *d)
4939  
4940         clear_bit(d->hwirq, pctrl->enabled_irqs);
4941  
4942 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4943 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4944  }
4945  
4946  static void msm_gpio_irq_unmask(struct irq_data *d)
4947 @@ -592,7 +592,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
4948  
4949         g = &pctrl->soc->groups[d->hwirq];
4950  
4951 -       spin_lock_irqsave(&pctrl->lock, flags);
4952 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4953  
4954         val = readl(pctrl->regs + g->intr_status_reg);
4955         val &= ~BIT(g->intr_status_bit);
4956 @@ -604,7 +604,7 @@ static void msm_gpio_irq_unmask(struct irq_data *d)
4957  
4958         set_bit(d->hwirq, pctrl->enabled_irqs);
4959  
4960 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4961 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4962  }
4963  
4964  static void msm_gpio_irq_ack(struct irq_data *d)
4965 @@ -617,7 +617,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
4966  
4967         g = &pctrl->soc->groups[d->hwirq];
4968  
4969 -       spin_lock_irqsave(&pctrl->lock, flags);
4970 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4971  
4972         val = readl(pctrl->regs + g->intr_status_reg);
4973         if (g->intr_ack_high)
4974 @@ -629,7 +629,7 @@ static void msm_gpio_irq_ack(struct irq_data *d)
4975         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
4976                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
4977  
4978 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4979 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4980  }
4981  
4982  static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4983 @@ -642,7 +642,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4984  
4985         g = &pctrl->soc->groups[d->hwirq];
4986  
4987 -       spin_lock_irqsave(&pctrl->lock, flags);
4988 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
4989  
4990         /*
4991          * For hw without possibility of detecting both edges
4992 @@ -716,7 +716,7 @@ static int msm_gpio_irq_set_type(struct irq_data *d, unsigned int type)
4993         if (test_bit(d->hwirq, pctrl->dual_edge_irqs))
4994                 msm_gpio_update_dual_edge_pos(pctrl, g, d);
4995  
4996 -       spin_unlock_irqrestore(&pctrl->lock, flags);
4997 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
4998  
4999         if (type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
5000                 irq_set_handler_locked(d, handle_level_irq);
5001 @@ -732,11 +732,11 @@ static int msm_gpio_irq_set_wake(struct irq_data *d, unsigned int on)
5002         struct msm_pinctrl *pctrl = gpiochip_get_data(gc);
5003         unsigned long flags;
5004  
5005 -       spin_lock_irqsave(&pctrl->lock, flags);
5006 +       raw_spin_lock_irqsave(&pctrl->lock, flags);
5007  
5008         irq_set_irq_wake(pctrl->irq, on);
5009  
5010 -       spin_unlock_irqrestore(&pctrl->lock, flags);
5011 +       raw_spin_unlock_irqrestore(&pctrl->lock, flags);
5012  
5013         return 0;
5014  }
5015 @@ -882,7 +882,7 @@ int msm_pinctrl_probe(struct platform_device *pdev,
5016         pctrl->soc = soc_data;
5017         pctrl->chip = msm_gpio_template;
5018  
5019 -       spin_lock_init(&pctrl->lock);
5020 +       raw_spin_lock_init(&pctrl->lock);
5021  
5022         res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
5023         pctrl->regs = devm_ioremap_resource(&pdev->dev, res);
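The pinctrl-msm conversion is purely a lock-type change: on RT a spinlock_t becomes a sleeping lock, which is not usable from the driver's irq_chip callbacks, so the lock is switched to raw_spinlock_t, which always spins with interrupts off. Usage is unchanged apart from the raw_ prefix; a self-contained sketch:

        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(hw_lock);    /* stays a real spinlock on RT */
        static unsigned long hw_shadow_reg;

        static void hw_set_bits(unsigned long bits)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&hw_lock, flags);
                hw_shadow_reg |= bits;          /* short, bounded update */
                raw_spin_unlock_irqrestore(&hw_lock, flags);
        }

The trade-off is that such sections must stay short and bounded, which register read-modify-write sequences like these are.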
5024 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
5025 index 9bd41a35a78a..8e2d436c2e3f 100644
5026 --- a/drivers/scsi/fcoe/fcoe.c
5027 +++ b/drivers/scsi/fcoe/fcoe.c
5028 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
5029  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
5030  {
5031         struct fcoe_percpu_s *fps;
5032 -       int rc;
5033 +       int rc, cpu = get_cpu_light();
5034  
5035 -       fps = &get_cpu_var(fcoe_percpu);
5036 +       fps = &per_cpu(fcoe_percpu, cpu);
5037         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
5038 -       put_cpu_var(fcoe_percpu);
5039 +       put_cpu_light();
5040  
5041         return rc;
5042  }
5043 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
5044                 return 0;
5045         }
5046  
5047 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5048 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5049         stats->InvalidCRCCount++;
5050         if (stats->InvalidCRCCount < 5)
5051                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
5052 -       put_cpu();
5053 +       put_cpu_light();
5054         return -EINVAL;
5055  }
5056  
5057 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5058          */
5059         hp = (struct fcoe_hdr *) skb_network_header(skb);
5060  
5061 -       stats = per_cpu_ptr(lport->stats, get_cpu());
5062 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
5063         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
5064                 if (stats->ErrorFrames < 5)
5065                         printk(KERN_WARNING "fcoe: FCoE version "
5066 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
5067                 goto drop;
5068  
5069         if (!fcoe_filter_frames(lport, fp)) {
5070 -               put_cpu();
5071 +               put_cpu_light();
5072                 fc_exch_recv(lport, fp);
5073                 return;
5074         }
5075  drop:
5076         stats->ErrorFrames++;
5077 -       put_cpu();
5078 +       put_cpu_light();
5079         kfree_skb(skb);
5080  }
5081  
5082 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
5083 index dcf36537a767..1a1f2e46452c 100644
5084 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
5085 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
5086 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5087  
5088         INIT_LIST_HEAD(&del_list);
5089  
5090 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
5091 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
5092  
5093         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
5094                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
5095 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
5096                                 sel_time = fcf->time;
5097                 }
5098         }
5099 -       put_cpu();
5100 +       put_cpu_light();
5101  
5102         list_for_each_entry_safe(fcf, next, &del_list, list) {
5103                 /* Removes fcf from current list */
5104 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
5105 index 16ca31ad5ec0..c3987347e762 100644
5106 --- a/drivers/scsi/libfc/fc_exch.c
5107 +++ b/drivers/scsi/libfc/fc_exch.c
5108 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
5109         }
5110         memset(ep, 0, sizeof(*ep));
5111  
5112 -       cpu = get_cpu();
5113 +       cpu = get_cpu_light();
5114         pool = per_cpu_ptr(mp->pool, cpu);
5115         spin_lock_bh(&pool->lock);
5116 -       put_cpu();
5117 +       put_cpu_light();
5118  
5119         /* peek cache of free slot */
5120         if (pool->left != FC_XID_UNKNOWN) {
5121 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
5122 index 763f012fdeca..d0f61b595470 100644
5123 --- a/drivers/scsi/libsas/sas_ata.c
5124 +++ b/drivers/scsi/libsas/sas_ata.c
5125 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5126         /* TODO: audit callers to ensure they are ready for qc_issue to
5127          * unconditionally re-enable interrupts
5128          */
5129 -       local_irq_save(flags);
5130 +       local_irq_save_nort(flags);
5131         spin_unlock(ap->lock);
5132  
5133         /* If the device fell off, no sense in issuing commands */
5134 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
5135  
5136   out:
5137         spin_lock(ap->lock);
5138 -       local_irq_restore(flags);
5139 +       local_irq_restore_nort(flags);
5140         return ret;
5141  }
5142  
5143 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
5144 index edc48f3b8230..ee5c6f9dfb6f 100644
5145 --- a/drivers/scsi/qla2xxx/qla_inline.h
5146 +++ b/drivers/scsi/qla2xxx/qla_inline.h
5147 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
5148  {
5149         unsigned long flags;
5150         struct qla_hw_data *ha = rsp->hw;
5151 -       local_irq_save(flags);
5152 +       local_irq_save_nort(flags);
5153         if (IS_P3P_TYPE(ha))
5154                 qla82xx_poll(0, rsp);
5155         else
5156                 ha->isp_ops->intr_handler(0, rsp);
5157 -       local_irq_restore(flags);
5158 +       local_irq_restore_nort(flags);
5159  }
5160  
5161  static inline uint8_t *
5162 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
5163 index 068c4e47fac9..a2090f640397 100644
5164 --- a/drivers/scsi/qla2xxx/qla_isr.c
5165 +++ b/drivers/scsi/qla2xxx/qla_isr.c
5166 @@ -3125,7 +3125,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
5167                 * kref_put().
5168                 */
5169                 kref_get(&qentry->irq_notify.kref);
5170 +#ifdef CONFIG_PREEMPT_RT_BASE
5171 +               swork_queue(&qentry->irq_notify.swork);
5172 +#else
5173                 schedule_work(&qentry->irq_notify.work);
5174 +#endif
5175         }
5176  
5177         /*
5178 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
5179 index 95f4c1bcdb4c..0be934799bff 100644
5180 --- a/drivers/thermal/x86_pkg_temp_thermal.c
5181 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
5182 @@ -29,6 +29,7 @@
5183  #include <linux/pm.h>
5184  #include <linux/thermal.h>
5185  #include <linux/debugfs.h>
5186 +#include <linux/swork.h>
5187  #include <asm/cpu_device_id.h>
5188  #include <asm/mce.h>
5189  
5190 @@ -353,7 +354,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
5191         }
5192  }
5193  
5194 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5195 +static void platform_thermal_notify_work(struct swork_event *event)
5196  {
5197         unsigned long flags;
5198         int cpu = smp_processor_id();
5199 @@ -370,7 +371,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5200                         pkg_work_scheduled[phy_id]) {
5201                 disable_pkg_thres_interrupt();
5202                 spin_unlock_irqrestore(&pkg_work_lock, flags);
5203 -               return -EINVAL;
5204 +               return;
5205         }
5206         pkg_work_scheduled[phy_id] = 1;
5207         spin_unlock_irqrestore(&pkg_work_lock, flags);
5208 @@ -379,9 +380,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5209         schedule_delayed_work_on(cpu,
5210                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
5211                                 msecs_to_jiffies(notify_delay_ms));
5212 +}
5213 +
5214 +#ifdef CONFIG_PREEMPT_RT_FULL
5215 +static struct swork_event notify_work;
5216 +
5217 +static int thermal_notify_work_init(void)
5218 +{
5219 +       int err;
5220 +
5221 +       err = swork_get();
5222 +       if (err)
5223 +               return err;
5224 +
5225 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
5226         return 0;
5227  }
5228  
5229 +static void thermal_notify_work_cleanup(void)
5230 +{
5231 +       swork_put();
5232 +}
5233 +
5234 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5235 +{
5236 +       swork_queue(&notify_work);
5237 +       return 0;
5238 +}
5239 +
5240 +#else  /* !CONFIG_PREEMPT_RT_FULL */
5241 +
5242 +static int thermal_notify_work_init(void) { return 0; }
5243 +
5244 +static void thermal_notify_work_cleanup(void) {  }
5245 +
5246 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
5247 +{
5248 +       platform_thermal_notify_work(NULL);
5249 +
5250 +       return 0;
5251 +}
5252 +#endif /* CONFIG_PREEMPT_RT_FULL */
5253 +
5254  static int find_siblings_cpu(int cpu)
5255  {
5256         int i;
5257 @@ -585,6 +625,9 @@ static int __init pkg_temp_thermal_init(void)
5258         if (!x86_match_cpu(pkg_temp_thermal_ids))
5259                 return -ENODEV;
5260  
5261 +       if (thermal_notify_work_init())
5262 +               return -ENODEV;
5263 +
5264         spin_lock_init(&pkg_work_lock);
5265         platform_thermal_package_notify =
5266                         pkg_temp_thermal_platform_thermal_notify;
5267 @@ -609,7 +652,7 @@ static int __init pkg_temp_thermal_init(void)
5268         kfree(pkg_work_scheduled);
5269         platform_thermal_package_notify = NULL;
5270         platform_thermal_package_rate_control = NULL;
5271 -
5272 +       thermal_notify_work_cleanup();
5273         return -ENODEV;
5274  }
5275  
5276 @@ -634,6 +677,7 @@ static void __exit pkg_temp_thermal_exit(void)
5277         mutex_unlock(&phy_dev_list_mutex);
5278         platform_thermal_package_notify = NULL;
5279         platform_thermal_package_rate_control = NULL;
5280 +       thermal_notify_work_cleanup();
5281         for_each_online_cpu(i)
5282                 cancel_delayed_work_sync(
5283                         &per_cpu(pkg_temp_thermal_threshold_work, i));
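x86_pkg_temp_thermal (and fs/aio.c below) defer work from contexts that cannot schedule regular work on RT through the simple-work ("swork") API this series introduces. The calls used are the ones visible in the hunk above (swork_get, INIT_SWORK, swork_queue, swork_put); a condensed usage sketch:

        #include <linux/swork.h>

        static struct swork_event notify_event;

        static void notify_fn(struct swork_event *event)
        {
                /* Runs in process context on the swork worker thread. */
        }

        static int notify_setup(void)
        {
                int err = swork_get();          /* take a ref on the worker */

                if (err)
                        return err;
                INIT_SWORK(&notify_event, notify_fn);
                return 0;
        }

        static void notify_trigger(void)        /* callable from IRQ context */
        {
                swork_queue(&notify_event);
        }

        static void notify_teardown(void)
        {
                swork_put();                    /* drop the worker reference */
        }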
5284 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
5285 index e8819aa20415..dd7f9bf45d6c 100644
5286 --- a/drivers/tty/serial/8250/8250_core.c
5287 +++ b/drivers/tty/serial/8250/8250_core.c
5288 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
5289  
5290  static unsigned int skip_txen_test; /* force skip of txen test at init time */
5291  
5292 -#define PASS_LIMIT     512
5293 +/*
5294 + * On -rt we can have more delays, and legitimately
5295 + * so - so don't drop work spuriously and spam the
5296 + * syslog:
5297 + */
5298 +#ifdef CONFIG_PREEMPT_RT_FULL
5299 +# define PASS_LIMIT    1000000
5300 +#else
5301 +# define PASS_LIMIT    512
5302 +#endif
5303  
5304  #include <asm/serial.h>
5305  /*
5306 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
5307 index 080d5a59d0a7..eecc4f111473 100644
5308 --- a/drivers/tty/serial/8250/8250_port.c
5309 +++ b/drivers/tty/serial/8250/8250_port.c
5310 @@ -35,6 +35,7 @@
5311  #include <linux/nmi.h>
5312  #include <linux/mutex.h>
5313  #include <linux/slab.h>
5314 +#include <linux/kdb.h>
5315  #include <linux/uaccess.h>
5316  #include <linux/pm_runtime.h>
5317  #include <linux/timer.h>
5318 @@ -3144,9 +3145,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
5319  
5320         serial8250_rpm_get(up);
5321  
5322 -       if (port->sysrq)
5323 +       if (port->sysrq || oops_in_progress)
5324                 locked = 0;
5325 -       else if (oops_in_progress)
5326 +       else if (in_kdb_printk())
5327                 locked = spin_trylock_irqsave(&port->lock, flags);
5328         else
5329                 spin_lock_irqsave(&port->lock, flags);
5330 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
5331 index e2c33b9528d8..53af53c43e8c 100644
5332 --- a/drivers/tty/serial/amba-pl011.c
5333 +++ b/drivers/tty/serial/amba-pl011.c
5334 @@ -2194,13 +2194,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5335  
5336         clk_enable(uap->clk);
5337  
5338 -       local_irq_save(flags);
5339 +       /*
5340 +        * local_irq_save(flags);
5341 +        *
5342 +        * handling then interrupts are already disabled. Aside from
5343 +        * handling then interrupts are already disabled. Aside of
5344 +        * that the port.sysrq check is racy on SMP regardless.
5345 +        */
5346         if (uap->port.sysrq)
5347                 locked = 0;
5348         else if (oops_in_progress)
5349 -               locked = spin_trylock(&uap->port.lock);
5350 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
5351         else
5352 -               spin_lock(&uap->port.lock);
5353 +               spin_lock_irqsave(&uap->port.lock, flags);
5354  
5355         /*
5356          *      First save the CR then disable the interrupts
5357 @@ -2224,8 +2230,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
5358                 pl011_write(old_cr, uap, REG_CR);
5359  
5360         if (locked)
5361 -               spin_unlock(&uap->port.lock);
5362 -       local_irq_restore(flags);
5363 +               spin_unlock_irqrestore(&uap->port.lock, flags);
5364  
5365         clk_disable(uap->clk);
5366  }
5367 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
5368 index a2a529994ba5..0ee7c4c518df 100644
5369 --- a/drivers/tty/serial/omap-serial.c
5370 +++ b/drivers/tty/serial/omap-serial.c
5371 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
5372  
5373         pm_runtime_get_sync(up->dev);
5374  
5375 -       local_irq_save(flags);
5376 -       if (up->port.sysrq)
5377 -               locked = 0;
5378 -       else if (oops_in_progress)
5379 -               locked = spin_trylock(&up->port.lock);
5380 +       if (up->port.sysrq || oops_in_progress)
5381 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
5382         else
5383 -               spin_lock(&up->port.lock);
5384 +               spin_lock_irqsave(&up->port.lock, flags);
5385  
5386         /*
5387          * First save the IER then disable the interrupts
5388 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
5389         pm_runtime_mark_last_busy(up->dev);
5390         pm_runtime_put_autosuspend(up->dev);
5391         if (locked)
5392 -               spin_unlock(&up->port.lock);
5393 -       local_irq_restore(flags);
5394 +               spin_unlock_irqrestore(&up->port.lock, flags);
5395  }
5396  
5397  static int __init
5398 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
5399 index 479e223f9cff..3418a54b4131 100644
5400 --- a/drivers/usb/core/hcd.c
5401 +++ b/drivers/usb/core/hcd.c
5402 @@ -1761,9 +1761,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
5403          * and no one may trigger the above deadlock situation when
5404          * running complete() in tasklet.
5405          */
5406 -       local_irq_save(flags);
5407 +       local_irq_save_nort(flags);
5408         urb->complete(urb);
5409 -       local_irq_restore(flags);
5410 +       local_irq_restore_nort(flags);
5411  
5412         usb_anchor_resume_wakeups(anchor);
5413         atomic_dec(&urb->use_count);
5414 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
5415 index 17989b72cdae..88c6574b5992 100644
5416 --- a/drivers/usb/gadget/function/f_fs.c
5417 +++ b/drivers/usb/gadget/function/f_fs.c
5418 @@ -1593,7 +1593,7 @@ static void ffs_data_put(struct ffs_data *ffs)
5419                 pr_info("%s(): freeing\n", __func__);
5420                 ffs_data_clear(ffs);
5421                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
5422 -                      waitqueue_active(&ffs->ep0req_completion.wait));
5423 +                      swait_active(&ffs->ep0req_completion.wait));
5424                 kfree(ffs->dev_name);
5425                 kfree(ffs);
5426         }
5427 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
5428 index 1468d8f085a3..6aae3ae25c18 100644
5429 --- a/drivers/usb/gadget/legacy/inode.c
5430 +++ b/drivers/usb/gadget/legacy/inode.c
5431 @@ -346,7 +346,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5432         spin_unlock_irq (&epdata->dev->lock);
5433  
5434         if (likely (value == 0)) {
5435 -               value = wait_event_interruptible (done.wait, done.done);
5436 +               value = swait_event_interruptible (done.wait, done.done);
5437                 if (value != 0) {
5438                         spin_lock_irq (&epdata->dev->lock);
5439                         if (likely (epdata->ep != NULL)) {
5440 @@ -355,7 +355,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
5441                                 usb_ep_dequeue (epdata->ep, epdata->req);
5442                                 spin_unlock_irq (&epdata->dev->lock);
5443  
5444 -                               wait_event (done.wait, done.done);
5445 +                               swait_event (done.wait, done.done);
5446                                 if (epdata->status == -ECONNRESET)
5447                                         epdata->status = -EINTR;
5448                         } else {
5449 diff --git a/fs/aio.c b/fs/aio.c
5450 index 428484f2f841..2b02e2eb2158 100644
5451 --- a/fs/aio.c
5452 +++ b/fs/aio.c
5453 @@ -40,6 +40,7 @@
5454  #include <linux/ramfs.h>
5455  #include <linux/percpu-refcount.h>
5456  #include <linux/mount.h>
5457 +#include <linux/swork.h>
5458  
5459  #include <asm/kmap_types.h>
5460  #include <asm/uaccess.h>
5461 @@ -115,7 +116,7 @@ struct kioctx {
5462         struct page             **ring_pages;
5463         long                    nr_pages;
5464  
5465 -       struct work_struct      free_work;
5466 +       struct swork_event      free_work;
5467  
5468         /*
5469          * signals when all in-flight requests are done
5470 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
5471                 .mount          = aio_mount,
5472                 .kill_sb        = kill_anon_super,
5473         };
5474 +       BUG_ON(swork_get());
5475         aio_mnt = kern_mount(&aio_fs);
5476         if (IS_ERR(aio_mnt))
5477                 panic("Failed to create aio fs mount.");
5478 @@ -581,9 +583,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
5479         return cancel(&kiocb->common);
5480  }
5481  
5482 -static void free_ioctx(struct work_struct *work)
5483 +static void free_ioctx(struct swork_event *sev)
5484  {
5485 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
5486 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5487  
5488         pr_debug("freeing %p\n", ctx);
5489  
5490 @@ -602,8 +604,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5491         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
5492                 complete(&ctx->rq_wait->comp);
5493  
5494 -       INIT_WORK(&ctx->free_work, free_ioctx);
5495 -       schedule_work(&ctx->free_work);
5496 +       INIT_SWORK(&ctx->free_work, free_ioctx);
5497 +       swork_queue(&ctx->free_work);
5498  }
5499  
5500  /*
5501 @@ -611,9 +613,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
5502   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
5503   * now it's safe to cancel any that need to be.
5504   */
5505 -static void free_ioctx_users(struct percpu_ref *ref)
5506 +static void free_ioctx_users_work(struct swork_event *sev)
5507  {
5508 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5509 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
5510         struct aio_kiocb *req;
5511  
5512         spin_lock_irq(&ctx->ctx_lock);
5513 @@ -632,6 +634,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
5514         percpu_ref_put(&ctx->reqs);
5515  }
5516  
5517 +static void free_ioctx_users(struct percpu_ref *ref)
5518 +{
5519 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
5520 +
5521 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
5522 +       swork_queue(&ctx->free_work);
5523 +}
5524 +
5525  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
5526  {
5527         unsigned i, new_nr;
5528 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
5529 index a1fba4285277..3796769b4cd1 100644
5530 --- a/fs/autofs4/autofs_i.h
5531 +++ b/fs/autofs4/autofs_i.h
5532 @@ -31,6 +31,7 @@
5533  #include <linux/sched.h>
5534  #include <linux/mount.h>
5535  #include <linux/namei.h>
5536 +#include <linux/delay.h>
5537  #include <asm/current.h>
5538  #include <linux/uaccess.h>
5539  
5540 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
5541 index d8e6d421c27f..2e689ab1306b 100644
5542 --- a/fs/autofs4/expire.c
5543 +++ b/fs/autofs4/expire.c
5544 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
5545                         parent = p->d_parent;
5546                         if (!spin_trylock(&parent->d_lock)) {
5547                                 spin_unlock(&p->d_lock);
5548 -                               cpu_relax();
5549 +                               cpu_chill();
5550                                 goto relock;
5551                         }
5552                         spin_unlock(&p->d_lock);
5553 diff --git a/fs/buffer.c b/fs/buffer.c
5554 index b205a629001d..5646afc022ba 100644
5555 --- a/fs/buffer.c
5556 +++ b/fs/buffer.c
5557 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5558          * decide that the page is now completely done.
5559          */
5560         first = page_buffers(page);
5561 -       local_irq_save(flags);
5562 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5563 +       flags = bh_uptodate_lock_irqsave(first);
5564         clear_buffer_async_read(bh);
5565         unlock_buffer(bh);
5566         tmp = bh;
5567 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5568                 }
5569                 tmp = tmp->b_this_page;
5570         } while (tmp != bh);
5571 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5572 -       local_irq_restore(flags);
5573 +       bh_uptodate_unlock_irqrestore(first, flags);
5574  
5575         /*
5576          * If none of the buffers had errors and they are all
5577 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
5578         return;
5579  
5580  still_busy:
5581 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5582 -       local_irq_restore(flags);
5583 -       return;
5584 +       bh_uptodate_unlock_irqrestore(first, flags);
5585  }
5586  
5587  /*
5588 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5589         }
5590  
5591         first = page_buffers(page);
5592 -       local_irq_save(flags);
5593 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
5594 +       flags = bh_uptodate_lock_irqsave(first);
5595  
5596         clear_buffer_async_write(bh);
5597         unlock_buffer(bh);
5598 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
5599                 }
5600                 tmp = tmp->b_this_page;
5601         }
5602 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5603 -       local_irq_restore(flags);
5604 +       bh_uptodate_unlock_irqrestore(first, flags);
5605         end_page_writeback(page);
5606         return;
5607  
5608  still_busy:
5609 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
5610 -       local_irq_restore(flags);
5611 -       return;
5612 +       bh_uptodate_unlock_irqrestore(first, flags);
5613  }
5614  EXPORT_SYMBOL(end_buffer_async_write);
5615  
5616 @@ -3383,6 +3375,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
5617         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
5618         if (ret) {
5619                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
5620 +               buffer_head_init_locks(ret);
5621                 preempt_disable();
5622                 __this_cpu_inc(bh_accounting.nr);
5623                 recalc_bh_state();
5624 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
5625 index 8f6a2a5863b9..4217828d0b68 100644
5626 --- a/fs/cifs/readdir.c
5627 +++ b/fs/cifs/readdir.c
5628 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
5629         struct inode *inode;
5630         struct super_block *sb = parent->d_sb;
5631         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
5632 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5633 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5634  
5635         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
5636  
5637 diff --git a/fs/dcache.c b/fs/dcache.c
5638 index 4485a48f4091..691039a6a872 100644
5639 --- a/fs/dcache.c
5640 +++ b/fs/dcache.c
5641 @@ -19,6 +19,7 @@
5642  #include <linux/mm.h>
5643  #include <linux/fs.h>
5644  #include <linux/fsnotify.h>
5645 +#include <linux/delay.h>
5646  #include <linux/slab.h>
5647  #include <linux/init.h>
5648  #include <linux/hash.h>
5649 @@ -750,6 +751,8 @@ static inline bool fast_dput(struct dentry *dentry)
5650   */
5651  void dput(struct dentry *dentry)
5652  {
5653 +       struct dentry *parent;
5654 +
5655         if (unlikely(!dentry))
5656                 return;
5657  
5658 @@ -788,9 +791,18 @@ void dput(struct dentry *dentry)
5659         return;
5660  
5661  kill_it:
5662 -       dentry = dentry_kill(dentry);
5663 -       if (dentry) {
5664 -               cond_resched();
5665 +       parent = dentry_kill(dentry);
5666 +       if (parent) {
5667 +               int r;
5668 +
5669 +               if (parent == dentry) {
5670 +                       /* the task with the highest priority won't schedule */
5671 +                       r = cond_resched();
5672 +                       if (!r)
5673 +                               cpu_chill();
5674 +               } else {
5675 +                       dentry = parent;
5676 +               }
5677                 goto repeat;
5678         }
5679  }
5680 @@ -2324,7 +2336,7 @@ void d_delete(struct dentry * dentry)
5681         if (dentry->d_lockref.count == 1) {
5682                 if (!spin_trylock(&inode->i_lock)) {
5683                         spin_unlock(&dentry->d_lock);
5684 -                       cpu_relax();
5685 +                       cpu_chill();
5686                         goto again;
5687                 }
5688                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
5689 @@ -2384,21 +2396,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
5690  
5691  static void d_wait_lookup(struct dentry *dentry)
5692  {
5693 -       if (d_in_lookup(dentry)) {
5694 -               DECLARE_WAITQUEUE(wait, current);
5695 -               add_wait_queue(dentry->d_wait, &wait);
5696 -               do {
5697 -                       set_current_state(TASK_UNINTERRUPTIBLE);
5698 -                       spin_unlock(&dentry->d_lock);
5699 -                       schedule();
5700 -                       spin_lock(&dentry->d_lock);
5701 -               } while (d_in_lookup(dentry));
5702 -       }
5703 +       struct swait_queue __wait;
5704 +
5705 +       if (!d_in_lookup(dentry))
5706 +               return;
5707 +
5708 +       INIT_LIST_HEAD(&__wait.task_list);
5709 +       do {
5710 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
5711 +               spin_unlock(&dentry->d_lock);
5712 +               schedule();
5713 +               spin_lock(&dentry->d_lock);
5714 +       } while (d_in_lookup(dentry));
5715 +       finish_swait(dentry->d_wait, &__wait);
5716  }
5717  
5718  struct dentry *d_alloc_parallel(struct dentry *parent,
5719                                 const struct qstr *name,
5720 -                               wait_queue_head_t *wq)
5721 +                               struct swait_queue_head *wq)
5722  {
5723         unsigned int hash = name->hash;
5724         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
5725 @@ -2507,7 +2522,7 @@ void __d_lookup_done(struct dentry *dentry)
5726         hlist_bl_lock(b);
5727         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
5728         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
5729 -       wake_up_all(dentry->d_wait);
5730 +       swake_up_all(dentry->d_wait);
5731         dentry->d_wait = NULL;
5732         hlist_bl_unlock(b);
5733         INIT_HLIST_NODE(&dentry->d_u.d_alias);
5734 @@ -3604,6 +3619,11 @@ EXPORT_SYMBOL(d_genocide);
5735  
5736  void __init vfs_caches_init_early(void)
5737  {
5738 +       int i;
5739 +
5740 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
5741 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
5742 +
5743         dcache_init_early();
5744         inode_init_early();
5745  }
5746 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
5747 index 10db91218933..42af0a06f657 100644
5748 --- a/fs/eventpoll.c
5749 +++ b/fs/eventpoll.c
5750 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
5751   */
5752  static void ep_poll_safewake(wait_queue_head_t *wq)
5753  {
5754 -       int this_cpu = get_cpu();
5755 +       int this_cpu = get_cpu_light();
5756  
5757         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
5758                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
5759  
5760 -       put_cpu();
5761 +       put_cpu_light();
5762  }
5763  
5764  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
5765 diff --git a/fs/exec.c b/fs/exec.c
5766 index 67e86571685a..fe14cdd84016 100644
5767 --- a/fs/exec.c
5768 +++ b/fs/exec.c
5769 @@ -1017,12 +1017,14 @@ static int exec_mmap(struct mm_struct *mm)
5770                 }
5771         }
5772         task_lock(tsk);
5773 +       preempt_disable_rt();
5774         active_mm = tsk->active_mm;
5775         tsk->mm = mm;
5776         tsk->active_mm = mm;
5777         activate_mm(active_mm, mm);
5778         tsk->mm->vmacache_seqnum = 0;
5779         vmacache_flush(tsk);
5780 +       preempt_enable_rt();
5781         task_unlock(tsk);
5782         if (old_mm) {
5783                 up_read(&old_mm->mmap_sem);
5784 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
5785 index 642c57b8de7b..8494b9308333 100644
5786 --- a/fs/fuse/dir.c
5787 +++ b/fs/fuse/dir.c
5788 @@ -1191,7 +1191,7 @@ static int fuse_direntplus_link(struct file *file,
5789         struct inode *dir = d_inode(parent);
5790         struct fuse_conn *fc;
5791         struct inode *inode;
5792 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5793 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5794  
5795         if (!o->nodeid) {
5796                 /*
5797 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
5798 index 684996c8a3a4..6e18a06aaabe 100644
5799 --- a/fs/jbd2/checkpoint.c
5800 +++ b/fs/jbd2/checkpoint.c
5801 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
5802         nblocks = jbd2_space_needed(journal);
5803         while (jbd2_log_space_left(journal) < nblocks) {
5804                 write_unlock(&journal->j_state_lock);
5805 +               if (current->plug)
5806 +                       io_schedule();
5807                 mutex_lock(&journal->j_checkpoint_mutex);
5808  
5809                 /*
5810 diff --git a/fs/locks.c b/fs/locks.c
5811 index 22c5b4aa4961..269c6a44449a 100644
5812 --- a/fs/locks.c
5813 +++ b/fs/locks.c
5814 @@ -935,7 +935,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5815                         return -ENOMEM;
5816         }
5817  
5818 -       percpu_down_read_preempt_disable(&file_rwsem);
5819 +       percpu_down_read(&file_rwsem);
5820         spin_lock(&ctx->flc_lock);
5821         if (request->fl_flags & FL_ACCESS)
5822                 goto find_conflict;
5823 @@ -976,7 +976,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
5824  
5825  out:
5826         spin_unlock(&ctx->flc_lock);
5827 -       percpu_up_read_preempt_enable(&file_rwsem);
5828 +       percpu_up_read(&file_rwsem);
5829         if (new_fl)
5830                 locks_free_lock(new_fl);
5831         locks_dispose_list(&dispose);
5832 @@ -1013,7 +1013,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5833                 new_fl2 = locks_alloc_lock();
5834         }
5835  
5836 -       percpu_down_read_preempt_disable(&file_rwsem);
5837 +       percpu_down_read(&file_rwsem);
5838         spin_lock(&ctx->flc_lock);
5839         /*
5840          * New lock request. Walk all POSIX locks and look for conflicts. If
5841 @@ -1185,7 +1185,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
5842         }
5843   out:
5844         spin_unlock(&ctx->flc_lock);
5845 -       percpu_up_read_preempt_enable(&file_rwsem);
5846 +       percpu_up_read(&file_rwsem);
5847         /*
5848          * Free any unused locks.
5849          */
5850 @@ -1460,7 +1460,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5851                 return error;
5852         }
5853  
5854 -       percpu_down_read_preempt_disable(&file_rwsem);
5855 +       percpu_down_read(&file_rwsem);
5856         spin_lock(&ctx->flc_lock);
5857  
5858         time_out_leases(inode, &dispose);
5859 @@ -1512,13 +1512,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5860         locks_insert_block(fl, new_fl);
5861         trace_break_lease_block(inode, new_fl);
5862         spin_unlock(&ctx->flc_lock);
5863 -       percpu_up_read_preempt_enable(&file_rwsem);
5864 +       percpu_up_read(&file_rwsem);
5865  
5866         locks_dispose_list(&dispose);
5867         error = wait_event_interruptible_timeout(new_fl->fl_wait,
5868                                                 !new_fl->fl_next, break_time);
5869  
5870 -       percpu_down_read_preempt_disable(&file_rwsem);
5871 +       percpu_down_read(&file_rwsem);
5872         spin_lock(&ctx->flc_lock);
5873         trace_break_lease_unblock(inode, new_fl);
5874         locks_delete_block(new_fl);
5875 @@ -1535,7 +1535,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
5876         }
5877  out:
5878         spin_unlock(&ctx->flc_lock);
5879 -       percpu_up_read_preempt_enable(&file_rwsem);
5880 +       percpu_up_read(&file_rwsem);
5881         locks_dispose_list(&dispose);
5882         locks_free_lock(new_fl);
5883         return error;
5884 @@ -1609,7 +1609,7 @@ int fcntl_getlease(struct file *filp)
5885  
5886         ctx = smp_load_acquire(&inode->i_flctx);
5887         if (ctx && !list_empty_careful(&ctx->flc_lease)) {
5888 -               percpu_down_read_preempt_disable(&file_rwsem);
5889 +               percpu_down_read(&file_rwsem);
5890                 spin_lock(&ctx->flc_lock);
5891                 time_out_leases(inode, &dispose);
5892                 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5893 @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp)
5894                         break;
5895                 }
5896                 spin_unlock(&ctx->flc_lock);
5897 -               percpu_up_read_preempt_enable(&file_rwsem);
5898 +               percpu_up_read(&file_rwsem);
5899  
5900                 locks_dispose_list(&dispose);
5901         }
5902 @@ -1694,7 +1694,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5903                 return -EINVAL;
5904         }
5905  
5906 -       percpu_down_read_preempt_disable(&file_rwsem);
5907 +       percpu_down_read(&file_rwsem);
5908         spin_lock(&ctx->flc_lock);
5909         time_out_leases(inode, &dispose);
5910         error = check_conflicting_open(dentry, arg, lease->fl_flags);
5911 @@ -1765,7 +1765,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
5912                 lease->fl_lmops->lm_setup(lease, priv);
5913  out:
5914         spin_unlock(&ctx->flc_lock);
5915 -       percpu_up_read_preempt_enable(&file_rwsem);
5916 +       percpu_up_read(&file_rwsem);
5917         locks_dispose_list(&dispose);
5918         if (is_deleg)
5919                 inode_unlock(inode);
5920 @@ -1788,7 +1788,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
5921                 return error;
5922         }
5923  
5924 -       percpu_down_read_preempt_disable(&file_rwsem);
5925 +       percpu_down_read(&file_rwsem);
5926         spin_lock(&ctx->flc_lock);
5927         list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
5928                 if (fl->fl_file == filp &&
5929 @@ -1801,7 +1801,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
5930         if (victim)
5931                 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
5932         spin_unlock(&ctx->flc_lock);
5933 -       percpu_up_read_preempt_enable(&file_rwsem);
5934 +       percpu_up_read(&file_rwsem);
5935         locks_dispose_list(&dispose);
5936         return error;
5937  }
5938 @@ -2532,13 +2532,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
5939         if (list_empty(&ctx->flc_lease))
5940                 return;
5941  
5942 -       percpu_down_read_preempt_disable(&file_rwsem);
5943 +       percpu_down_read(&file_rwsem);
5944         spin_lock(&ctx->flc_lock);
5945         list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
5946                 if (filp == fl->fl_file)
5947                         lease_modify(fl, F_UNLCK, &dispose);
5948         spin_unlock(&ctx->flc_lock);
5949 -       percpu_up_read_preempt_enable(&file_rwsem);
5950 +       percpu_up_read(&file_rwsem);
5951  
5952         locks_dispose_list(&dispose);
5953  }
5954 diff --git a/fs/namei.c b/fs/namei.c
5955 index 5b4eed221530..9c8dd3c83a80 100644
5956 --- a/fs/namei.c
5957 +++ b/fs/namei.c
5958 @@ -1629,7 +1629,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
5959  {
5960         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
5961         struct inode *inode = dir->d_inode;
5962 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5963 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5964  
5965         inode_lock_shared(inode);
5966         /* Don't go there if it's already dead */
5967 @@ -3086,7 +3086,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
5968         struct dentry *dentry;
5969         int error, create_error = 0;
5970         umode_t mode = op->mode;
5971 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
5972 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
5973  
5974         if (unlikely(IS_DEADDIR(dir_inode)))
5975                 return -ENOENT;
5976 diff --git a/fs/namespace.c b/fs/namespace.c
5977 index 7cea503ae06d..cb15f5397991 100644
5978 --- a/fs/namespace.c
5979 +++ b/fs/namespace.c
5980 @@ -14,6 +14,7 @@
5981  #include <linux/mnt_namespace.h>
5982  #include <linux/user_namespace.h>
5983  #include <linux/namei.h>
5984 +#include <linux/delay.h>
5985  #include <linux/security.h>
5986  #include <linux/idr.h>
5987  #include <linux/init.h>                /* init_rootfs */
5988 @@ -356,8 +357,11 @@ int __mnt_want_write(struct vfsmount *m)
5989          * incremented count after it has set MNT_WRITE_HOLD.
5990          */
5991         smp_mb();
5992 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
5993 -               cpu_relax();
5994 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
5995 +               preempt_enable();
5996 +               cpu_chill();
5997 +               preempt_disable();
5998 +       }
5999         /*
6000          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
6001          * be set to match its requirements. So we must not load that until
6002 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
6003 index dff600ae0d74..d726d2e09353 100644
6004 --- a/fs/nfs/delegation.c
6005 +++ b/fs/nfs/delegation.c
6006 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
6007                 sp = state->owner;
6008                 /* Block nfs4_proc_unlck */
6009                 mutex_lock(&sp->so_delegreturn_mutex);
6010 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6011 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
6012                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
6013                 if (!err)
6014                         err = nfs_delegation_claim_locks(ctx, state, stateid);
6015 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6016 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
6017                         err = -EAGAIN;
6018                 mutex_unlock(&sp->so_delegreturn_mutex);
6019                 put_nfs_open_context(ctx);
6020 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
6021 index 53e02b8bd9bd..a66e7d77cfbb 100644
6022 --- a/fs/nfs/dir.c
6023 +++ b/fs/nfs/dir.c
6024 @@ -485,7 +485,7 @@ static
6025  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
6026  {
6027         struct qstr filename = QSTR_INIT(entry->name, entry->len);
6028 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6029 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6030         struct dentry *dentry;
6031         struct dentry *alias;
6032         struct inode *dir = d_inode(parent);
6033 @@ -1487,7 +1487,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
6034                     struct file *file, unsigned open_flags,
6035                     umode_t mode, int *opened)
6036  {
6037 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6038 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6039         struct nfs_open_context *ctx;
6040         struct dentry *res;
6041         struct iattr attr = { .ia_valid = ATTR_OPEN };
6042 @@ -1802,7 +1802,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6043  
6044         trace_nfs_rmdir_enter(dir, dentry);
6045         if (d_really_is_positive(dentry)) {
6046 +#ifdef CONFIG_PREEMPT_RT_BASE
6047 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
6048 +#else
6049                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6050 +#endif
6051                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6052                 /* Ensure the VFS deletes this inode */
6053                 switch (error) {
6054 @@ -1812,7 +1816,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
6055                 case -ENOENT:
6056                         nfs_dentry_handle_enoent(dentry);
6057                 }
6058 +#ifdef CONFIG_PREEMPT_RT_BASE
6059 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
6060 +#else
6061                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
6062 +#endif
6063         } else
6064                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
6065         trace_nfs_rmdir_exit(dir, dentry, error);
6066 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
6067 index bf4ec5ecc97e..36cd5fc9192c 100644
6068 --- a/fs/nfs/inode.c
6069 +++ b/fs/nfs/inode.c
6070 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
6071         nfsi->nrequests = 0;
6072         nfsi->commit_info.ncommit = 0;
6073         atomic_set(&nfsi->commit_info.rpcs_out, 0);
6074 +#ifdef CONFIG_PREEMPT_RT_BASE
6075 +       sema_init(&nfsi->rmdir_sem, 1);
6076 +#else
6077         init_rwsem(&nfsi->rmdir_sem);
6078 +#endif
6079         nfs4_init_once(nfsi);
6080  }
6081  
6082 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
6083 index 1452177c822d..f43b01d54c59 100644
6084 --- a/fs/nfs/nfs4_fs.h
6085 +++ b/fs/nfs/nfs4_fs.h
6086 @@ -111,7 +111,7 @@ struct nfs4_state_owner {
6087         unsigned long        so_flags;
6088         struct list_head     so_states;
6089         struct nfs_seqid_counter so_seqid;
6090 -       seqcount_t           so_reclaim_seqcount;
6091 +       seqlock_t            so_reclaim_seqlock;
6092         struct mutex         so_delegreturn_mutex;
6093  };
6094  
6095 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
6096 index 241da19b7da4..8f9636cc298f 100644
6097 --- a/fs/nfs/nfs4proc.c
6098 +++ b/fs/nfs/nfs4proc.c
6099 @@ -2697,7 +2697,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6100         unsigned int seq;
6101         int ret;
6102  
6103 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
6104 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6105  
6106         ret = _nfs4_proc_open(opendata);
6107         if (ret != 0)
6108 @@ -2735,7 +2735,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
6109         ctx->state = state;
6110         if (d_inode(dentry) == state->inode) {
6111                 nfs_inode_attach_open_context(ctx);
6112 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
6113 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
6114                         nfs4_schedule_stateid_recovery(server, state);
6115         }
6116  out:
6117 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
6118 index 0959c9661662..dabd834d7686 100644
6119 --- a/fs/nfs/nfs4state.c
6120 +++ b/fs/nfs/nfs4state.c
6121 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
6122         nfs4_init_seqid_counter(&sp->so_seqid);
6123         atomic_set(&sp->so_count, 1);
6124         INIT_LIST_HEAD(&sp->so_lru);
6125 -       seqcount_init(&sp->so_reclaim_seqcount);
6126 +       seqlock_init(&sp->so_reclaim_seqlock);
6127         mutex_init(&sp->so_delegreturn_mutex);
6128         return sp;
6129  }
6130 @@ -1497,8 +1497,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6131          * recovering after a network partition or a reboot from a
6132          * server that doesn't support a grace period.
6133          */
6134 +#ifdef CONFIG_PREEMPT_RT_FULL
6135 +       write_seqlock(&sp->so_reclaim_seqlock);
6136 +#else
6137 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
6138 +#endif
6139         spin_lock(&sp->so_lock);
6140 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
6141  restart:
6142         list_for_each_entry(state, &sp->so_states, open_states) {
6143                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
6144 @@ -1567,14 +1571,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
6145                 spin_lock(&sp->so_lock);
6146                 goto restart;
6147         }
6148 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6149         spin_unlock(&sp->so_lock);
6150 +#ifdef CONFIG_PREEMPT_RT_FULL
6151 +       write_sequnlock(&sp->so_reclaim_seqlock);
6152 +#else
6153 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6154 +#endif
6155         return 0;
6156  out_err:
6157         nfs4_put_open_state(state);
6158 -       spin_lock(&sp->so_lock);
6159 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
6160 -       spin_unlock(&sp->so_lock);
6161 +#ifdef CONFIG_PREEMPT_RT_FULL
6162 +       write_sequnlock(&sp->so_reclaim_seqlock);
6163 +#else
6164 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
6165 +#endif
6166         return status;
6167  }
6168  
6169 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
6170 index 191aa577dd1f..58990c8f52e0 100644
6171 --- a/fs/nfs/unlink.c
6172 +++ b/fs/nfs/unlink.c
6173 @@ -12,7 +12,7 @@
6174  #include <linux/sunrpc/clnt.h>
6175  #include <linux/nfs_fs.h>
6176  #include <linux/sched.h>
6177 -#include <linux/wait.h>
6178 +#include <linux/swait.h>
6179  #include <linux/namei.h>
6180  #include <linux/fsnotify.h>
6181  
6182 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
6183                 rpc_restart_call_prepare(task);
6184  }
6185  
6186 +#ifdef CONFIG_PREEMPT_RT_BASE
6187 +static void nfs_down_anon(struct semaphore *sema)
6188 +{
6189 +       down(sema);
6190 +}
6191 +
6192 +static void nfs_up_anon(struct semaphore *sema)
6193 +{
6194 +       up(sema);
6195 +}
6196 +
6197 +#else
6198 +static void nfs_down_anon(struct rw_semaphore *rwsem)
6199 +{
6200 +       down_read_non_owner(rwsem);
6201 +}
6202 +
6203 +static void nfs_up_anon(struct rw_semaphore *rwsem)
6204 +{
6205 +       up_read_non_owner(rwsem);
6206 +}
6207 +#endif
6208 +
6209  /**
6210   * nfs_async_unlink_release - Release the sillydelete data.
6211   * @task: rpc_task of the sillydelete
6212 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
6213         struct dentry *dentry = data->dentry;
6214         struct super_block *sb = dentry->d_sb;
6215  
6216 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6217 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
6218         d_lookup_done(dentry);
6219         nfs_free_unlinkdata(data);
6220         dput(dentry);
6221 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6222         struct inode *dir = d_inode(dentry->d_parent);
6223         struct dentry *alias;
6224  
6225 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
6226 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
6227         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
6228         if (IS_ERR(alias)) {
6229 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6230 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6231                 return 0;
6232         }
6233         if (!d_in_lookup(alias)) {
6234 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
6235                         ret = 0;
6236                 spin_unlock(&alias->d_lock);
6237                 dput(alias);
6238 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
6239 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
6240                 /*
6241                  * If we'd displaced old cached devname, free it.  At that
6242                  * point dentry is definitely not a root, so we won't need
6243 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
6244                 goto out_free_name;
6245         }
6246         data->res.dir_attr = &data->dir_attr;
6247 -       init_waitqueue_head(&data->wq);
6248 +       init_swait_queue_head(&data->wq);
6249  
6250         status = -EBUSY;
6251         spin_lock(&dentry->d_lock);
6252 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
6253 index fe251f187ff8..e89da4fb14c2 100644
6254 --- a/fs/ntfs/aops.c
6255 +++ b/fs/ntfs/aops.c
6256 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6257                         ofs = 0;
6258                         if (file_ofs < init_size)
6259                                 ofs = init_size - file_ofs;
6260 -                       local_irq_save(flags);
6261 +                       local_irq_save_nort(flags);
6262                         kaddr = kmap_atomic(page);
6263                         memset(kaddr + bh_offset(bh) + ofs, 0,
6264                                         bh->b_size - ofs);
6265                         flush_dcache_page(page);
6266                         kunmap_atomic(kaddr);
6267 -                       local_irq_restore(flags);
6268 +                       local_irq_restore_nort(flags);
6269                 }
6270         } else {
6271                 clear_buffer_uptodate(bh);
6272 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6273                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
6274         }
6275         first = page_buffers(page);
6276 -       local_irq_save(flags);
6277 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6278 +       flags = bh_uptodate_lock_irqsave(first);
6279         clear_buffer_async_read(bh);
6280         unlock_buffer(bh);
6281         tmp = bh;
6282 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6283                 }
6284                 tmp = tmp->b_this_page;
6285         } while (tmp != bh);
6286 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6287 -       local_irq_restore(flags);
6288 +       bh_uptodate_unlock_irqrestore(first, flags);
6289         /*
6290          * If none of the buffers had errors then we can set the page uptodate,
6291          * but we first have to perform the post read mst fixups, if the
6292 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6293                 recs = PAGE_SIZE / rec_size;
6294                 /* Should have been verified before we got here... */
6295                 BUG_ON(!recs);
6296 -               local_irq_save(flags);
6297 +               local_irq_save_nort(flags);
6298                 kaddr = kmap_atomic(page);
6299                 for (i = 0; i < recs; i++)
6300                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
6301                                         i * rec_size), rec_size);
6302                 kunmap_atomic(kaddr);
6303 -               local_irq_restore(flags);
6304 +               local_irq_restore_nort(flags);
6305                 flush_dcache_page(page);
6306                 if (likely(page_uptodate && !PageError(page)))
6307                         SetPageUptodate(page);
6308 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
6309         unlock_page(page);
6310         return;
6311  still_busy:
6312 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6313 -       local_irq_restore(flags);
6314 -       return;
6315 +       bh_uptodate_unlock_irqrestore(first, flags);
6316  }
6317  
6318  /**
6319 diff --git a/fs/proc/base.c b/fs/proc/base.c
6320 index ca651ac00660..41d9dc789285 100644
6321 --- a/fs/proc/base.c
6322 +++ b/fs/proc/base.c
6323 @@ -1834,7 +1834,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
6324  
6325         child = d_hash_and_lookup(dir, &qname);
6326         if (!child) {
6327 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6328 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6329                 child = d_alloc_parallel(dir, &qname, &wq);
6330                 if (IS_ERR(child))
6331                         goto end_instantiate;
6332 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
6333 index d4e37acd4821..000cea46434a 100644
6334 --- a/fs/proc/proc_sysctl.c
6335 +++ b/fs/proc/proc_sysctl.c
6336 @@ -632,7 +632,7 @@ static bool proc_sys_fill_cache(struct file *file,
6337  
6338         child = d_lookup(dir, &qname);
6339         if (!child) {
6340 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6341 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6342                 child = d_alloc_parallel(dir, &qname, &wq);
6343                 if (IS_ERR(child))
6344                         return false;
6345 diff --git a/fs/timerfd.c b/fs/timerfd.c
6346 index 9ae4abb4110b..8644b67c48fd 100644
6347 --- a/fs/timerfd.c
6348 +++ b/fs/timerfd.c
6349 @@ -460,7 +460,10 @@ static int do_timerfd_settime(int ufd, int flags,
6350                                 break;
6351                 }
6352                 spin_unlock_irq(&ctx->wqh.lock);
6353 -               cpu_relax();
6354 +               if (isalarm(ctx))
6355 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
6356 +               else
6357 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
6358         }
6359  
6360         /*
6361 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
6362 index e861a24f06f2..b5c97d3059c7 100644
6363 --- a/include/acpi/platform/aclinux.h
6364 +++ b/include/acpi/platform/aclinux.h
6365 @@ -133,6 +133,7 @@
6366  
6367  #define acpi_cache_t                        struct kmem_cache
6368  #define acpi_spinlock                       spinlock_t *
6369 +#define acpi_raw_spinlock              raw_spinlock_t *
6370  #define acpi_cpu_flags                      unsigned long
6371  
6372  /* Use native linux version of acpi_os_allocate_zeroed */
6373 @@ -151,6 +152,20 @@
6374  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
6375  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
6376  
6377 +#define acpi_os_create_raw_lock(__handle)                      \
6378 +({                                                             \
6379 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
6380 +                                                               \
6381 +        if (lock) {                                            \
6382 +               *(__handle) = lock;                             \
6383 +               raw_spin_lock_init(*(__handle));                \
6384 +        }                                                      \
6385 +        lock ? AE_OK : AE_NO_MEMORY;                           \
6386 + })
6387 +
6388 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
6389 +
6390 +
6391  /*
6392   * OSL interfaces used by debugger/disassembler
6393   */
6394 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
6395 index 6f96247226a4..fa53a21263c2 100644
6396 --- a/include/asm-generic/bug.h
6397 +++ b/include/asm-generic/bug.h
6398 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
6399  # define WARN_ON_SMP(x)                        ({0;})
6400  #endif
6401  
6402 +#ifdef CONFIG_PREEMPT_RT_BASE
6403 +# define BUG_ON_RT(c)                  BUG_ON(c)
6404 +# define BUG_ON_NONRT(c)               do { } while (0)
6405 +# define WARN_ON_RT(condition)         WARN_ON(condition)
6406 +# define WARN_ON_NONRT(condition)      do { } while (0)
6407 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
6408 +#else
6409 +# define BUG_ON_RT(c)                  do { } while (0)
6410 +# define BUG_ON_NONRT(c)               BUG_ON(c)
6411 +# define WARN_ON_RT(condition)         do { } while (0)
6412 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
6413 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
6414 +#endif
6415 +
6416  #endif /* __ASSEMBLY__ */
6417  
6418  #endif
6419 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
6420 index 535ab2e13d2e..cfc246899473 100644
6421 --- a/include/linux/blk-mq.h
6422 +++ b/include/linux/blk-mq.h
6423 @@ -209,7 +209,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
6424         return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
6425  }
6426  
6427 -
6428 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
6429  int blk_mq_request_started(struct request *rq);
6430  void blk_mq_start_request(struct request *rq);
6431  void blk_mq_end_request(struct request *rq, int error);
6432 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
6433 index f6a816129856..ec7a4676f8a8 100644
6434 --- a/include/linux/blkdev.h
6435 +++ b/include/linux/blkdev.h
6436 @@ -89,6 +89,7 @@ struct request {
6437         struct list_head queuelist;
6438         union {
6439                 struct call_single_data csd;
6440 +               struct work_struct work;
6441                 u64 fifo_time;
6442         };
6443  
6444 @@ -467,7 +468,7 @@ struct request_queue {
6445         struct throtl_data *td;
6446  #endif
6447         struct rcu_head         rcu_head;
6448 -       wait_queue_head_t       mq_freeze_wq;
6449 +       struct swait_queue_head mq_freeze_wq;
6450         struct percpu_ref       q_usage_counter;
6451         struct list_head        all_q_node;
6452  
6453 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
6454 index 8fdcb783197d..d07dbeec7bc1 100644
6455 --- a/include/linux/bottom_half.h
6456 +++ b/include/linux/bottom_half.h
6457 @@ -3,6 +3,39 @@
6458  
6459  #include <linux/preempt.h>
6460  
6461 +#ifdef CONFIG_PREEMPT_RT_FULL
6462 +
6463 +extern void __local_bh_disable(void);
6464 +extern void _local_bh_enable(void);
6465 +extern void __local_bh_enable(void);
6466 +
6467 +static inline void local_bh_disable(void)
6468 +{
6469 +       __local_bh_disable();
6470 +}
6471 +
6472 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
6473 +{
6474 +       __local_bh_disable();
6475 +}
6476 +
6477 +static inline void local_bh_enable(void)
6478 +{
6479 +       __local_bh_enable();
6480 +}
6481 +
6482 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
6483 +{
6484 +       __local_bh_enable();
6485 +}
6486 +
6487 +static inline void local_bh_enable_ip(unsigned long ip)
6488 +{
6489 +       __local_bh_enable();
6490 +}
6491 +
6492 +#else
6493 +
6494  #ifdef CONFIG_TRACE_IRQFLAGS
6495  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
6496  #else
6497 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
6498  {
6499         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
6500  }
6501 +#endif
6502  
6503  #endif /* _LINUX_BH_H */
6504 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
6505 index ebbacd14d450..be5e87f6360a 100644
6506 --- a/include/linux/buffer_head.h
6507 +++ b/include/linux/buffer_head.h
6508 @@ -75,8 +75,50 @@ struct buffer_head {
6509         struct address_space *b_assoc_map;      /* mapping this buffer is
6510                                                    associated with */
6511         atomic_t b_count;               /* users using this buffer_head */
6512 +#ifdef CONFIG_PREEMPT_RT_BASE
6513 +       spinlock_t b_uptodate_lock;
6514 +#if IS_ENABLED(CONFIG_JBD2)
6515 +       spinlock_t b_state_lock;
6516 +       spinlock_t b_journal_head_lock;
6517 +#endif
6518 +#endif
6519  };
6520  
6521 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
6522 +{
6523 +       unsigned long flags;
6524 +
6525 +#ifndef CONFIG_PREEMPT_RT_BASE
6526 +       local_irq_save(flags);
6527 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
6528 +#else
6529 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
6530 +#endif
6531 +       return flags;
6532 +}
6533 +
6534 +static inline void
6535 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
6536 +{
6537 +#ifndef CONFIG_PREEMPT_RT_BASE
6538 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
6539 +       local_irq_restore(flags);
6540 +#else
6541 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
6542 +#endif
6543 +}
6544 +
6545 +static inline void buffer_head_init_locks(struct buffer_head *bh)
6546 +{
6547 +#ifdef CONFIG_PREEMPT_RT_BASE
6548 +       spin_lock_init(&bh->b_uptodate_lock);
6549 +#if IS_ENABLED(CONFIG_JBD2)
6550 +       spin_lock_init(&bh->b_state_lock);
6551 +       spin_lock_init(&bh->b_journal_head_lock);
6552 +#endif
6553 +#endif
6554 +}
6555 +
6556  /*
6557   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
6558   * and buffer_foo() functions.
6559 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
6560 index 5b17de62c962..56027cc01a56 100644
6561 --- a/include/linux/cgroup-defs.h
6562 +++ b/include/linux/cgroup-defs.h
6563 @@ -16,6 +16,7 @@
6564  #include <linux/percpu-refcount.h>
6565  #include <linux/percpu-rwsem.h>
6566  #include <linux/workqueue.h>
6567 +#include <linux/swork.h>
6568  
6569  #ifdef CONFIG_CGROUPS
6570  
6571 @@ -137,6 +138,7 @@ struct cgroup_subsys_state {
6572         /* percpu_ref killing and RCU release */
6573         struct rcu_head rcu_head;
6574         struct work_struct destroy_work;
6575 +       struct swork_event destroy_swork;
6576  };
6577  
6578  /*
6579 diff --git a/include/linux/completion.h b/include/linux/completion.h
6580 index 5d5aaae3af43..3bca1590e29f 100644
6581 --- a/include/linux/completion.h
6582 +++ b/include/linux/completion.h
6583 @@ -7,8 +7,7 @@
6584   * Atomic wait-for-completion handler data structures.
6585   * See kernel/sched/completion.c for details.
6586   */
6587 -
6588 -#include <linux/wait.h>
6589 +#include <linux/swait.h>
6590  
6591  /*
6592   * struct completion - structure used to maintain state for a "completion"
6593 @@ -24,11 +23,11 @@
6594   */
6595  struct completion {
6596         unsigned int done;
6597 -       wait_queue_head_t wait;
6598 +       struct swait_queue_head wait;
6599  };
6600  
6601  #define COMPLETION_INITIALIZER(work) \
6602 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6603 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
6604  
6605  #define COMPLETION_INITIALIZER_ONSTACK(work) \
6606         ({ init_completion(&work); work; })
6607 @@ -73,7 +72,7 @@ struct completion {
6608  static inline void init_completion(struct completion *x)
6609  {
6610         x->done = 0;
6611 -       init_waitqueue_head(&x->wait);
6612 +       init_swait_queue_head(&x->wait);
6613  }
6614  
6615  /**
6616 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
6617 index e571128ad99a..5e52d28c20c1 100644
6618 --- a/include/linux/cpu.h
6619 +++ b/include/linux/cpu.h
6620 @@ -182,6 +182,8 @@ extern void get_online_cpus(void);
6621  extern void put_online_cpus(void);
6622  extern void cpu_hotplug_disable(void);
6623  extern void cpu_hotplug_enable(void);
6624 +extern void pin_current_cpu(void);
6625 +extern void unpin_current_cpu(void);
6626  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
6627  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
6628  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
6629 @@ -199,6 +201,8 @@ static inline void cpu_hotplug_done(void) {}
6630  #define put_online_cpus()      do { } while (0)
6631  #define cpu_hotplug_disable()  do { } while (0)
6632  #define cpu_hotplug_enable()   do { } while (0)
6633 +static inline void pin_current_cpu(void) { }
6634 +static inline void unpin_current_cpu(void) { }
6635  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
6636  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
6637  /* These aren't inline functions due to a GCC bug. */
6638 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
6639 index 5beed7b30561..61cab7ef458e 100644
6640 --- a/include/linux/dcache.h
6641 +++ b/include/linux/dcache.h
6642 @@ -11,6 +11,7 @@
6643  #include <linux/rcupdate.h>
6644  #include <linux/lockref.h>
6645  #include <linux/stringhash.h>
6646 +#include <linux/wait.h>
6647  
6648  struct path;
6649  struct vfsmount;
6650 @@ -100,7 +101,7 @@ struct dentry {
6651  
6652         union {
6653                 struct list_head d_lru;         /* LRU list */
6654 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
6655 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
6656         };
6657         struct list_head d_child;       /* child of parent list */
6658         struct list_head d_subdirs;     /* our children */
6659 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
6660  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
6661  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
6662  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
6663 -                                       wait_queue_head_t *);
6664 +                                       struct swait_queue_head *);
6665  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
6666  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
6667  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
6668 diff --git a/include/linux/delay.h b/include/linux/delay.h
6669 index a6ecb34cf547..37caab306336 100644
6670 --- a/include/linux/delay.h
6671 +++ b/include/linux/delay.h
6672 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
6673         msleep(seconds * 1000);
6674  }
6675  
6676 +#ifdef CONFIG_PREEMPT_RT_FULL
6677 +extern void cpu_chill(void);
6678 +#else
6679 +# define cpu_chill()   cpu_relax()
6680 +#endif
6681 +
6682  #endif /* defined(_LINUX_DELAY_H) */
6683 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
6684 index bb3f3297062a..a117a33ef72c 100644
6685 --- a/include/linux/highmem.h
6686 +++ b/include/linux/highmem.h
6687 @@ -7,6 +7,7 @@
6688  #include <linux/mm.h>
6689  #include <linux/uaccess.h>
6690  #include <linux/hardirq.h>
6691 +#include <linux/sched.h>
6692  
6693  #include <asm/cacheflush.h>
6694  
6695 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
6696  
6697  static inline void *kmap_atomic(struct page *page)
6698  {
6699 -       preempt_disable();
6700 +       preempt_disable_nort();
6701         pagefault_disable();
6702         return page_address(page);
6703  }
6704 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
6705  static inline void __kunmap_atomic(void *addr)
6706  {
6707         pagefault_enable();
6708 -       preempt_enable();
6709 +       preempt_enable_nort();
6710  }
6711  
6712  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
6713 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
6714  
6715  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
6716  
6717 +#ifndef CONFIG_PREEMPT_RT_FULL
6718  DECLARE_PER_CPU(int, __kmap_atomic_idx);
6719 +#endif
6720  
6721  static inline int kmap_atomic_idx_push(void)
6722  {
6723 +#ifndef CONFIG_PREEMPT_RT_FULL
6724         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
6725  
6726 -#ifdef CONFIG_DEBUG_HIGHMEM
6727 +# ifdef CONFIG_DEBUG_HIGHMEM
6728         WARN_ON_ONCE(in_irq() && !irqs_disabled());
6729         BUG_ON(idx >= KM_TYPE_NR);
6730 -#endif
6731 +# endif
6732         return idx;
6733 +#else
6734 +       current->kmap_idx++;
6735 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
6736 +       return current->kmap_idx - 1;
6737 +#endif
6738  }
6739  
6740  static inline int kmap_atomic_idx(void)
6741  {
6742 +#ifndef CONFIG_PREEMPT_RT_FULL
6743         return __this_cpu_read(__kmap_atomic_idx) - 1;
6744 +#else
6745 +       return current->kmap_idx - 1;
6746 +#endif
6747  }
6748  
6749  static inline void kmap_atomic_idx_pop(void)
6750  {
6751 -#ifdef CONFIG_DEBUG_HIGHMEM
6752 +#ifndef CONFIG_PREEMPT_RT_FULL
6753 +# ifdef CONFIG_DEBUG_HIGHMEM
6754         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
6755  
6756         BUG_ON(idx < 0);
6757 -#else
6758 +# else
6759         __this_cpu_dec(__kmap_atomic_idx);
6760 +# endif
6761 +#else
6762 +       current->kmap_idx--;
6763 +# ifdef CONFIG_DEBUG_HIGHMEM
6764 +       BUG_ON(current->kmap_idx < 0);
6765 +# endif
6766  #endif
6767  }
6768  
6769 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
6770 index 5e00f80b1535..65d0671f20b4 100644
6771 --- a/include/linux/hrtimer.h
6772 +++ b/include/linux/hrtimer.h
6773 @@ -87,6 +87,9 @@ enum hrtimer_restart {
6774   * @function:  timer expiry callback function
6775   * @base:      pointer to the timer base (per cpu and per clock)
6776   * @state:     state information (See bit values above)
6777 + * @cb_entry:  list entry to defer timers from hardirq context
6778 + * @irqsafe:   timer can run in hardirq context
6779 + * @praecox:   timer expiry time if expired at the time of programming
6780   * @is_rel:    Set if the timer was armed relative
6781   * @start_pid:  timer statistics field to store the pid of the task which
6782   *             started the timer
6783 @@ -103,6 +106,11 @@ struct hrtimer {
6784         enum hrtimer_restart            (*function)(struct hrtimer *);
6785         struct hrtimer_clock_base       *base;
6786         u8                              state;
6787 +       struct list_head                cb_entry;
6788 +       int                             irqsafe;
6789 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
6790 +       ktime_t                         praecox;
6791 +#endif
6792         u8                              is_rel;
6793  #ifdef CONFIG_TIMER_STATS
6794         int                             start_pid;
6795 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
6796         struct task_struct *task;
6797  };
6798  
6799 -#ifdef CONFIG_64BIT
6800  # define HRTIMER_CLOCK_BASE_ALIGN      64
6801 -#else
6802 -# define HRTIMER_CLOCK_BASE_ALIGN      32
6803 -#endif
6804  
6805  /**
6806   * struct hrtimer_clock_base - the timer base for a specific clock
6807 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
6808   *                     timer to a base on another cpu.
6809   * @clockid:           clock id for per_cpu support
6810   * @active:            red black tree root node for the active timers
6811 + * @expired:           list head for deferred timers.
6812   * @get_time:          function to retrieve the current time of the clock
6813   * @offset:            offset of this clock to the monotonic base
6814   */
6815 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
6816         int                     index;
6817         clockid_t               clockid;
6818         struct timerqueue_head  active;
6819 +       struct list_head        expired;
6820         ktime_t                 (*get_time)(void);
6821         ktime_t                 offset;
6822  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
6823 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
6824         raw_spinlock_t                  lock;
6825         seqcount_t                      seq;
6826         struct hrtimer                  *running;
6827 +       struct hrtimer                  *running_soft;
6828         unsigned int                    cpu;
6829         unsigned int                    active_bases;
6830         unsigned int                    clock_was_set_seq;
6831 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
6832         unsigned int                    nr_hangs;
6833         unsigned int                    max_hang_time;
6834  #endif
6835 +#ifdef CONFIG_PREEMPT_RT_BASE
6836 +       wait_queue_head_t               wait;
6837 +#endif
6838         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
6839  } ____cacheline_aligned;
6840  
6841 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
6842         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
6843  }
6844  
6845 +/* Softirq preemption could deadlock timer removal */
6846 +#ifdef CONFIG_PREEMPT_RT_BASE
6847 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
6848 +#else
6849 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
6850 +#endif
6851 +
6852  /* Query timers: */
6853  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
6854  
6855 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
6856   * Helper function to check, whether the timer is running the callback
6857   * function
6858   */
6859 -static inline int hrtimer_callback_running(struct hrtimer *timer)
6860 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
6861  {
6862         return timer->base->cpu_base->running == timer;
6863  }
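
The hunk above makes timer-callback completion waitable on RT. A minimal sketch (not part of the patch) of the intended caller pattern, combining the new hrtimer_wait_for_timer() with the existing hrtimer_try_to_cancel(); my_timer and my_cancel_sync() are hypothetical names:

#include <linux/hrtimer.h>

/* Hypothetical timer, for illustration only. */
static struct hrtimer my_timer;

static void my_cancel_sync(void)
{
	/*
	 * hrtimer_try_to_cancel() returns -1 while the callback is
	 * running.  On PREEMPT_RT_BASE hrtimer_wait_for_timer() blocks
	 * until the callback has finished; on !RT it is just cpu_relax().
	 */
	while (hrtimer_try_to_cancel(&my_timer) < 0)
		hrtimer_wait_for_timer(&my_timer);
}
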
6864 diff --git a/include/linux/idr.h b/include/linux/idr.h
6865 index 083d61e92706..5899796f50cb 100644
6866 --- a/include/linux/idr.h
6867 +++ b/include/linux/idr.h
6868 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
6869   * Each idr_preload() should be matched with an invocation of this
6870   * function.  See idr_preload() for details.
6871   */
6872 +#ifdef CONFIG_PREEMPT_RT_FULL
6873 +void idr_preload_end(void);
6874 +#else
6875  static inline void idr_preload_end(void)
6876  {
6877         preempt_enable();
6878  }
6879 +#endif
6880  
6881  /**
6882   * idr_find - return pointer for given id
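
For context, a minimal sketch of the idr_preload()/idr_preload_end() pairing that the hunk above changes on RT (the end marker goes out of line instead of being a bare preempt_enable()); my_idr, my_lock and my_alloc_id() are hypothetical:

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int my_alloc_id(void *ptr)
{
	int id;

	idr_preload(GFP_KERNEL);		/* may sleep, preallocates */
	spin_lock(&my_lock);
	id = idr_alloc(&my_idr, ptr, 0, 0, GFP_NOWAIT);
	spin_unlock(&my_lock);
	idr_preload_end();			/* out of line on RT */

	return id;
}
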
6883 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
6884 index 325f649d77ff..8af70bcc799b 100644
6885 --- a/include/linux/init_task.h
6886 +++ b/include/linux/init_task.h
6887 @@ -150,6 +150,12 @@ extern struct task_group root_task_group;
6888  # define INIT_PERF_EVENTS(tsk)
6889  #endif
6890  
6891 +#ifdef CONFIG_PREEMPT_RT_BASE
6892 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
6893 +#else
6894 +# define INIT_TIMER_LIST
6895 +#endif
6896 +
6897  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
6898  # define INIT_VTIME(tsk)                                               \
6899         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
6900 @@ -250,6 +256,7 @@ extern struct task_group root_task_group;
6901         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
6902         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
6903         .timer_slack_ns = 50000, /* 50 usec default slack */            \
6904 +       INIT_TIMER_LIST                                                 \
6905         .pids = {                                                       \
6906                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
6907                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
6908 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
6909 index 72f0721f75e7..480972ae47d3 100644
6910 --- a/include/linux/interrupt.h
6911 +++ b/include/linux/interrupt.h
6912 @@ -14,6 +14,7 @@
6913  #include <linux/hrtimer.h>
6914  #include <linux/kref.h>
6915  #include <linux/workqueue.h>
6916 +#include <linux/swork.h>
6917  
6918  #include <linux/atomic.h>
6919  #include <asm/ptrace.h>
6920 @@ -61,6 +62,7 @@
6921   *                interrupt handler after suspending interrupts. For system
6922   *                wakeup devices users need to implement wakeup detection in
6923   *                their interrupt handlers.
6924 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
6925   */
6926  #define IRQF_SHARED            0x00000080
6927  #define IRQF_PROBE_SHARED      0x00000100
6928 @@ -74,6 +76,7 @@
6929  #define IRQF_NO_THREAD         0x00010000
6930  #define IRQF_EARLY_RESUME      0x00020000
6931  #define IRQF_COND_SUSPEND      0x00040000
6932 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
6933  
6934  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
6935  
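
A minimal sketch of how a driver would pass the new IRQF_NO_SOFTIRQ_CALL flag; my_handler and my_setup_irq() are hypothetical, and whether a given driver should set the flag is a per-driver decision:

#include <linux/interrupt.h>

static irqreturn_t my_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int my_setup_irq(unsigned int irq, void *dev_id)
{
	/*
	 * On RT the forced irq thread normally processes pending
	 * softirqs after the handler; IRQF_NO_SOFTIRQ_CALL opts out.
	 */
	return request_irq(irq, my_handler, IRQF_NO_SOFTIRQ_CALL, "mydev", dev_id);
}
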
6936 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
6937  #ifdef CONFIG_LOCKDEP
6938  # define local_irq_enable_in_hardirq() do { } while (0)
6939  #else
6940 -# define local_irq_enable_in_hardirq() local_irq_enable()
6941 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
6942  #endif
6943  
6944  extern void disable_irq_nosync(unsigned int irq);
6945 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
6946   * struct irq_affinity_notify - context for notification of IRQ affinity changes
6947   * @irq:               Interrupt to which notification applies
6948   * @kref:              Reference count, for internal use
6949 + * @swork:             Swork item, for internal use
6950   * @work:              Work item, for internal use
6951   * @notify:            Function to be called on change.  This will be
6952   *                     called in process context.
6953 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
6954  struct irq_affinity_notify {
6955         unsigned int irq;
6956         struct kref kref;
6957 +#ifdef CONFIG_PREEMPT_RT_BASE
6958 +       struct swork_event swork;
6959 +#else
6960         struct work_struct work;
6961 +#endif
6962         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
6963         void (*release)(struct kref *ref);
6964  };
6965 @@ -406,9 +414,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
6966                                  bool state);
6967  
6968  #ifdef CONFIG_IRQ_FORCED_THREADING
6969 +# ifndef CONFIG_PREEMPT_RT_BASE
6970  extern bool force_irqthreads;
6971 +# else
6972 +#  define force_irqthreads     (true)
6973 +# endif
6974  #else
6975 -#define force_irqthreads       (0)
6976 +#define force_irqthreads       (false)
6977  #endif
6978  
6979  #ifndef __ARCH_SET_SOFTIRQ_PENDING
6980 @@ -465,9 +477,10 @@ struct softirq_action
6981         void    (*action)(struct softirq_action *);
6982  };
6983  
6984 +#ifndef CONFIG_PREEMPT_RT_FULL
6985  asmlinkage void do_softirq(void);
6986  asmlinkage void __do_softirq(void);
6987 -
6988 +static inline void thread_do_softirq(void) { do_softirq(); }
6989  #ifdef __ARCH_HAS_DO_SOFTIRQ
6990  void do_softirq_own_stack(void);
6991  #else
6992 @@ -476,13 +489,25 @@ static inline void do_softirq_own_stack(void)
6993         __do_softirq();
6994  }
6995  #endif
6996 +#else
6997 +extern void thread_do_softirq(void);
6998 +#endif
6999  
7000  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
7001  extern void softirq_init(void);
7002  extern void __raise_softirq_irqoff(unsigned int nr);
7003 +#ifdef CONFIG_PREEMPT_RT_FULL
7004 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
7005 +#else
7006 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
7007 +{
7008 +       __raise_softirq_irqoff(nr);
7009 +}
7010 +#endif
7011  
7012  extern void raise_softirq_irqoff(unsigned int nr);
7013  extern void raise_softirq(unsigned int nr);
7014 +extern void softirq_check_pending_idle(void);
7015  
7016  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
7017  
7018 @@ -504,8 +529,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
7019       to be executed on some cpu at least once after this.
7020     * If the tasklet is already scheduled, but its execution is still not
7021       started, it will be executed only once.
7022 -   * If this tasklet is already running on another CPU (or schedule is called
7023 -     from tasklet itself), it is rescheduled for later.
7024 +   * If this tasklet is already running on another CPU, it is rescheduled
7025 +     for later.
7026 +   * tasklet_schedule() must not be called from within the tasklet itself
7027 +     (a lockup would occur); see the sketch below.
7027     * Tasklet is strictly serialized wrt itself, but not
7028       wrt another tasklets. If client needs some intertask synchronization,
7029       he makes it with spinlocks.
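
A minimal sketch of the rule stated in the updated comment above: rescheduling has to come from outside the tasklet callback. my_tasklet_fn and my_irq_handler are hypothetical:

#include <linux/interrupt.h>

static void my_tasklet_fn(unsigned long data)
{
	/*
	 * Deferred work goes here.  Per the rule above, do not call
	 * tasklet_schedule() on this tasklet from inside this function.
	 */
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	tasklet_schedule(&my_tasklet);	/* legal: scheduled from the irq handler */
	return IRQ_HANDLED;
}
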
7030 @@ -530,27 +556,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
7031  enum
7032  {
7033         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
7034 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
7035 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
7036 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
7037  };
7038  
7039 -#ifdef CONFIG_SMP
7040 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
7041 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
7042 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
7043 +
7044 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
7045  static inline int tasklet_trylock(struct tasklet_struct *t)
7046  {
7047         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
7048  }
7049  
7050 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
7051 +{
7052 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
7053 +}
7054 +
7055  static inline void tasklet_unlock(struct tasklet_struct *t)
7056  {
7057         smp_mb__before_atomic();
7058         clear_bit(TASKLET_STATE_RUN, &(t)->state);
7059  }
7060  
7061 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
7062 -{
7063 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
7064 -}
7065 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
7066 +
7067  #else
7068  #define tasklet_trylock(t) 1
7069 +#define tasklet_tryunlock(t)   1
7070  #define tasklet_unlock_wait(t) do { } while (0)
7071  #define tasklet_unlock(t) do { } while (0)
7072  #endif
7073 @@ -599,12 +634,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
7074         smp_mb();
7075  }
7076  
7077 -static inline void tasklet_enable(struct tasklet_struct *t)
7078 -{
7079 -       smp_mb__before_atomic();
7080 -       atomic_dec(&t->count);
7081 -}
7082 -
7083 +extern void tasklet_enable(struct tasklet_struct *t);
7084  extern void tasklet_kill(struct tasklet_struct *t);
7085  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
7086  extern void tasklet_init(struct tasklet_struct *t,
7087 @@ -635,6 +665,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
7088         tasklet_kill(&ttimer->tasklet);
7089  }
7090  
7091 +#ifdef CONFIG_PREEMPT_RT_FULL
7092 +extern void softirq_early_init(void);
7093 +#else
7094 +static inline void softirq_early_init(void) { }
7095 +#endif
7096 +
7097  /*
7098   * Autoprobing for irqs:
7099   *
7100 diff --git a/include/linux/irq.h b/include/linux/irq.h
7101 index e79875574b39..177cee0c3305 100644
7102 --- a/include/linux/irq.h
7103 +++ b/include/linux/irq.h
7104 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
7105   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
7106   *                               it from the spurious interrupt detection
7107   *                               mechanism and from core side polling.
7108 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
7109   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
7110   */
7111  enum {
7112 @@ -99,13 +100,14 @@ enum {
7113         IRQ_PER_CPU_DEVID       = (1 << 17),
7114         IRQ_IS_POLLED           = (1 << 18),
7115         IRQ_DISABLE_UNLAZY      = (1 << 19),
7116 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
7117  };
7118  
7119  #define IRQF_MODIFY_MASK       \
7120         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
7121          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
7122          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
7123 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
7124 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
7125  
7126  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
7127  
7128 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
7129 index 47b9ebd4a74f..2543aab05daa 100644
7130 --- a/include/linux/irq_work.h
7131 +++ b/include/linux/irq_work.h
7132 @@ -16,6 +16,7 @@
7133  #define IRQ_WORK_BUSY          2UL
7134  #define IRQ_WORK_FLAGS         3UL
7135  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
7136 +#define IRQ_WORK_HARD_IRQ      8UL /* Run in hard IRQ context, even on RT */
7137  
7138  struct irq_work {
7139         unsigned long flags;
7140 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
7141  static inline void irq_work_run(void) { }
7142  #endif
7143  
7144 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
7145 +void irq_work_tick_soft(void);
7146 +#else
7147 +static inline void irq_work_tick_soft(void) { }
7148 +#endif
7149 +
7150  #endif /* _LINUX_IRQ_WORK_H */
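
A minimal sketch of an irq_work item marked with the new IRQ_WORK_HARD_IRQ flag; my_irq_work_fn, my_work and my_poke() are hypothetical:

#include <linux/irq_work.h>

static void my_irq_work_fn(struct irq_work *work)
{
	/* Must stay non-sleeping: runs from the hard interrupt path. */
}

static struct irq_work my_work = {
	.flags	= IRQ_WORK_HARD_IRQ,	/* keep in hard irq context, even on RT */
	.func	= my_irq_work_fn,
};

static void my_poke(void)
{
	irq_work_queue(&my_work);
}
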
7151 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
7152 index c9be57931b58..eeeb540971ae 100644
7153 --- a/include/linux/irqdesc.h
7154 +++ b/include/linux/irqdesc.h
7155 @@ -66,6 +66,7 @@ struct irq_desc {
7156         unsigned int            irqs_unhandled;
7157         atomic_t                threads_handled;
7158         int                     threads_handled_last;
7159 +       u64                     random_ip;
7160         raw_spinlock_t          lock;
7161         struct cpumask          *percpu_enabled;
7162         const struct cpumask    *percpu_affinity;
7163 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
7164 index 5dd1272d1ab2..9b77034f7c5e 100644
7165 --- a/include/linux/irqflags.h
7166 +++ b/include/linux/irqflags.h
7167 @@ -25,8 +25,6 @@
7168  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
7169  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
7170  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
7171 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
7172 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
7173  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
7174  #else
7175  # define trace_hardirqs_on()           do { } while (0)
7176 @@ -39,9 +37,15 @@
7177  # define trace_softirqs_enabled(p)     0
7178  # define trace_hardirq_enter()         do { } while (0)
7179  # define trace_hardirq_exit()          do { } while (0)
7180 +# define INIT_TRACE_IRQFLAGS
7181 +#endif
7182 +
7183 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
7184 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
7185 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
7186 +#else
7187  # define lockdep_softirq_enter()       do { } while (0)
7188  # define lockdep_softirq_exit()                do { } while (0)
7189 -# define INIT_TRACE_IRQFLAGS
7190  #endif
7191  
7192  #if defined(CONFIG_IRQSOFF_TRACER) || \
7193 @@ -148,4 +152,23 @@
7194  
7195  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
7196  
7197 +/*
7198 + * local_irq* variants depending on RT/!RT
7199 + */
7200 +#ifdef CONFIG_PREEMPT_RT_FULL
7201 +# define local_irq_disable_nort()      do { } while (0)
7202 +# define local_irq_enable_nort()       do { } while (0)
7203 +# define local_irq_save_nort(flags)    local_save_flags(flags)
7204 +# define local_irq_restore_nort(flags) (void)(flags)
7205 +# define local_irq_disable_rt()                local_irq_disable()
7206 +# define local_irq_enable_rt()         local_irq_enable()
7207 +#else
7208 +# define local_irq_disable_nort()      local_irq_disable()
7209 +# define local_irq_enable_nort()       local_irq_enable()
7210 +# define local_irq_save_nort(flags)    local_irq_save(flags)
7211 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
7212 +# define local_irq_disable_rt()                do { } while (0)
7213 +# define local_irq_enable_rt()         do { } while (0)
7214 +#endif
7215 +
7216  #endif
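
A minimal sketch of the _nort variants added above, assuming a hypothetical my_touch_hw() whose data is protected by a sleeping lock on RT:

#include <linux/irqflags.h>

static void my_touch_hw(void)
{
	unsigned long flags;

	/*
	 * !RT: interrupts are disabled around the access.
	 * RT:  only the flags are saved; the section stays preemptible,
	 *      relying on the (sleeping) lock that serializes this data.
	 */
	local_irq_save_nort(flags);
	/* ... short, non-sleeping hardware access ... */
	local_irq_restore_nort(flags);
}
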
7217 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
7218 index dfaa1f4dcb0c..d57dd06544a1 100644
7219 --- a/include/linux/jbd2.h
7220 +++ b/include/linux/jbd2.h
7221 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
7222  
7223  static inline void jbd_lock_bh_state(struct buffer_head *bh)
7224  {
7225 +#ifndef CONFIG_PREEMPT_RT_BASE
7226         bit_spin_lock(BH_State, &bh->b_state);
7227 +#else
7228 +       spin_lock(&bh->b_state_lock);
7229 +#endif
7230  }
7231  
7232  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
7233  {
7234 +#ifndef CONFIG_PREEMPT_RT_BASE
7235         return bit_spin_trylock(BH_State, &bh->b_state);
7236 +#else
7237 +       return spin_trylock(&bh->b_state_lock);
7238 +#endif
7239  }
7240  
7241  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
7242  {
7243 +#ifndef CONFIG_PREEMPT_RT_BASE
7244         return bit_spin_is_locked(BH_State, &bh->b_state);
7245 +#else
7246 +       return spin_is_locked(&bh->b_state_lock);
7247 +#endif
7248  }
7249  
7250  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
7251  {
7252 +#ifndef CONFIG_PREEMPT_RT_BASE
7253         bit_spin_unlock(BH_State, &bh->b_state);
7254 +#else
7255 +       spin_unlock(&bh->b_state_lock);
7256 +#endif
7257  }
7258  
7259  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
7260  {
7261 +#ifndef CONFIG_PREEMPT_RT_BASE
7262         bit_spin_lock(BH_JournalHead, &bh->b_state);
7263 +#else
7264 +       spin_lock(&bh->b_journal_head_lock);
7265 +#endif
7266  }
7267  
7268  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
7269  {
7270 +#ifndef CONFIG_PREEMPT_RT_BASE
7271         bit_spin_unlock(BH_JournalHead, &bh->b_state);
7272 +#else
7273 +       spin_unlock(&bh->b_journal_head_lock);
7274 +#endif
7275  }
7276  
7277  #define J_ASSERT(assert)       BUG_ON(!(assert))
7278 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
7279 index 410decacff8f..0861bebfc188 100644
7280 --- a/include/linux/kdb.h
7281 +++ b/include/linux/kdb.h
7282 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
7283  extern __printf(1, 2) int kdb_printf(const char *, ...);
7284  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
7285  
7286 +#define in_kdb_printk()        (kdb_trap_printk)
7287  extern void kdb_init(int level);
7288  
7289  /* Access to kdb specific polling devices */
7290 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
7291  extern int kdb_unregister(char *);
7292  #else /* ! CONFIG_KGDB_KDB */
7293  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
7294 +#define in_kdb_printk() (0)
7295  static inline void kdb_init(int level) {}
7296  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
7297                                char *help, short minlen) { return 0; }
7298 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
7299 index bc6ed52a39b9..7894d55e4998 100644
7300 --- a/include/linux/kernel.h
7301 +++ b/include/linux/kernel.h
7302 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
7303   */
7304  # define might_sleep() \
7305         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7306 +
7307 +# define might_sleep_no_state_check() \
7308 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
7309  # define sched_annotate_sleep()        (current->task_state_change = 0)
7310  #else
7311    static inline void ___might_sleep(const char *file, int line,
7312 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
7313    static inline void __might_sleep(const char *file, int line,
7314                                    int preempt_offset) { }
7315  # define might_sleep() do { might_resched(); } while (0)
7316 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
7317  # define sched_annotate_sleep() do { } while (0)
7318  #endif
7319  
7320 @@ -488,6 +492,7 @@ extern enum system_states {
7321         SYSTEM_HALT,
7322         SYSTEM_POWER_OFF,
7323         SYSTEM_RESTART,
7324 +       SYSTEM_SUSPEND,
7325  } system_state;
7326  
7327  #define TAINT_PROPRIETARY_MODULE       0
7328 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
7329 index cb483305e1f5..4e5062316bb6 100644
7330 --- a/include/linux/list_bl.h
7331 +++ b/include/linux/list_bl.h
7332 @@ -2,6 +2,7 @@
7333  #define _LINUX_LIST_BL_H
7334  
7335  #include <linux/list.h>
7336 +#include <linux/spinlock.h>
7337  #include <linux/bit_spinlock.h>
7338  
7339  /*
7340 @@ -32,13 +33,24 @@
7341  
7342  struct hlist_bl_head {
7343         struct hlist_bl_node *first;
7344 +#ifdef CONFIG_PREEMPT_RT_BASE
7345 +       raw_spinlock_t lock;
7346 +#endif
7347  };
7348  
7349  struct hlist_bl_node {
7350         struct hlist_bl_node *next, **pprev;
7351  };
7352 -#define INIT_HLIST_BL_HEAD(ptr) \
7353 -       ((ptr)->first = NULL)
7354 +
7355 +#ifdef CONFIG_PREEMPT_RT_BASE
7356 +#define INIT_HLIST_BL_HEAD(h)          \
7357 +do {                                   \
7358 +       (h)->first = NULL;              \
7359 +       raw_spin_lock_init(&(h)->lock); \
7360 +} while (0)
7361 +#else
7362 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
7363 +#endif
7364  
7365  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
7366  {
7367 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
7368  
7369  static inline void hlist_bl_lock(struct hlist_bl_head *b)
7370  {
7371 +#ifndef CONFIG_PREEMPT_RT_BASE
7372         bit_spin_lock(0, (unsigned long *)b);
7373 +#else
7374 +       raw_spin_lock(&b->lock);
7375 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7376 +       __set_bit(0, (unsigned long *)b);
7377 +#endif
7378 +#endif
7379  }
7380  
7381  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
7382  {
7383 +#ifndef CONFIG_PREEMPT_RT_BASE
7384         __bit_spin_unlock(0, (unsigned long *)b);
7385 +#else
7386 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
7387 +       __clear_bit(0, (unsigned long *)b);
7388 +#endif
7389 +       raw_spin_unlock(&b->lock);
7390 +#endif
7391  }
7392  
7393  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
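
Callers of the bit-locked list keep the same API; a minimal sketch with a hypothetical my_bucket, my_init() and my_insert():

#include <linux/init.h>
#include <linux/list_bl.h>

static struct hlist_bl_head my_bucket;

static int __init my_init(void)
{
	INIT_HLIST_BL_HEAD(&my_bucket);	/* also inits the raw lock on RT */
	return 0;
}

static void my_insert(struct hlist_bl_node *node)
{
	hlist_bl_lock(&my_bucket);	/* bit spinlock, or raw spinlock on RT */
	hlist_bl_add_head(node, &my_bucket);
	hlist_bl_unlock(&my_bucket);
}
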
7394 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
7395 new file mode 100644
7396 index 000000000000..845c77f1a5ca
7397 --- /dev/null
7398 +++ b/include/linux/locallock.h
7399 @@ -0,0 +1,278 @@
7400 +#ifndef _LINUX_LOCALLOCK_H
7401 +#define _LINUX_LOCALLOCK_H
7402 +
7403 +#include <linux/percpu.h>
7404 +#include <linux/spinlock.h>
7405 +
7406 +#ifdef CONFIG_PREEMPT_RT_BASE
7407 +
7408 +#ifdef CONFIG_DEBUG_SPINLOCK
7409 +# define LL_WARN(cond) WARN_ON(cond)
7410 +#else
7411 +# define LL_WARN(cond) do { } while (0)
7412 +#endif
7413 +
7414 +/*
7415 + * per cpu lock based substitute for local_irq_*()
7416 + */
7417 +struct local_irq_lock {
7418 +       spinlock_t              lock;
7419 +       struct task_struct      *owner;
7420 +       int                     nestcnt;
7421 +       unsigned long           flags;
7422 +};
7423 +
7424 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
7425 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
7426 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
7427 +
7428 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
7429 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
7430 +
7431 +#define local_irq_lock_init(lvar)                                      \
7432 +       do {                                                            \
7433 +               int __cpu;                                              \
7434 +               for_each_possible_cpu(__cpu)                            \
7435 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
7436 +       } while (0)
7437 +
7438 +/*
7439 + * spin_lock|trylock|unlock_local flavour that does not call migrate_disable();
7440 + * used by __local_lock|trylock|unlock, where get_local_var/put_local_var
7441 + * already take care of migrate_disable/enable.
7442 + * Without CONFIG_PREEMPT_RT_FULL these map to the normal spin_* calls.
7443 + */
7444 +#ifdef CONFIG_PREEMPT_RT_FULL
7445 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
7446 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
7447 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
7448 +#else
7449 +# define spin_lock_local(lock)                 spin_lock(lock)
7450 +# define spin_trylock_local(lock)              spin_trylock(lock)
7451 +# define spin_unlock_local(lock)               spin_unlock(lock)
7452 +#endif
7453 +
7454 +static inline void __local_lock(struct local_irq_lock *lv)
7455 +{
7456 +       if (lv->owner != current) {
7457 +               spin_lock_local(&lv->lock);
7458 +               LL_WARN(lv->owner);
7459 +               LL_WARN(lv->nestcnt);
7460 +               lv->owner = current;
7461 +       }
7462 +       lv->nestcnt++;
7463 +}
7464 +
7465 +#define local_lock(lvar)                                       \
7466 +       do { __local_lock(&get_local_var(lvar)); } while (0)
7467 +
7468 +#define local_lock_on(lvar, cpu)                               \
7469 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
7470 +
7471 +static inline int __local_trylock(struct local_irq_lock *lv)
7472 +{
7473 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
7474 +               LL_WARN(lv->owner);
7475 +               LL_WARN(lv->nestcnt);
7476 +               lv->owner = current;
7477 +               lv->nestcnt = 1;
7478 +               return 1;
7479 +       }
7480 +       return 0;
7481 +}
7482 +
7483 +#define local_trylock(lvar)                                            \
7484 +       ({                                                              \
7485 +               int __locked;                                           \
7486 +               __locked = __local_trylock(&get_local_var(lvar));       \
7487 +               if (!__locked)                                          \
7488 +                       put_local_var(lvar);                            \
7489 +               __locked;                                               \
7490 +       })
7491 +
7492 +static inline void __local_unlock(struct local_irq_lock *lv)
7493 +{
7494 +       LL_WARN(lv->nestcnt == 0);
7495 +       LL_WARN(lv->owner != current);
7496 +       if (--lv->nestcnt)
7497 +               return;
7498 +
7499 +       lv->owner = NULL;
7500 +       spin_unlock_local(&lv->lock);
7501 +}
7502 +
7503 +#define local_unlock(lvar)                                     \
7504 +       do {                                                    \
7505 +               __local_unlock(this_cpu_ptr(&lvar));            \
7506 +               put_local_var(lvar);                            \
7507 +       } while (0)
7508 +
7509 +#define local_unlock_on(lvar, cpu)                       \
7510 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
7511 +
7512 +static inline void __local_lock_irq(struct local_irq_lock *lv)
7513 +{
7514 +       spin_lock_irqsave(&lv->lock, lv->flags);
7515 +       LL_WARN(lv->owner);
7516 +       LL_WARN(lv->nestcnt);
7517 +       lv->owner = current;
7518 +       lv->nestcnt = 1;
7519 +}
7520 +
7521 +#define local_lock_irq(lvar)                                           \
7522 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
7523 +
7524 +#define local_lock_irq_on(lvar, cpu)                                   \
7525 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
7526 +
7527 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
7528 +{
7529 +       LL_WARN(!lv->nestcnt);
7530 +       LL_WARN(lv->owner != current);
7531 +       lv->owner = NULL;
7532 +       lv->nestcnt = 0;
7533 +       spin_unlock_irq(&lv->lock);
7534 +}
7535 +
7536 +#define local_unlock_irq(lvar)                                         \
7537 +       do {                                                            \
7538 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
7539 +               put_local_var(lvar);                                    \
7540 +       } while (0)
7541 +
7542 +#define local_unlock_irq_on(lvar, cpu)                                 \
7543 +       do {                                                            \
7544 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
7545 +       } while (0)
7546 +
7547 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
7548 +{
7549 +       if (lv->owner != current) {
7550 +               __local_lock_irq(lv);
7551 +               return 0;
7552 +       } else {
7553 +               lv->nestcnt++;
7554 +               return 1;
7555 +       }
7556 +}
7557 +
7558 +#define local_lock_irqsave(lvar, _flags)                               \
7559 +       do {                                                            \
7560 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
7561 +                       put_local_var(lvar);                            \
7562 +               _flags = __this_cpu_read(lvar.flags);                   \
7563 +       } while (0)
7564 +
7565 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
7566 +       do {                                                            \
7567 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
7568 +               _flags = per_cpu(lvar, cpu).flags;                      \
7569 +       } while (0)
7570 +
7571 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
7572 +                                           unsigned long flags)
7573 +{
7574 +       LL_WARN(!lv->nestcnt);
7575 +       LL_WARN(lv->owner != current);
7576 +       if (--lv->nestcnt)
7577 +               return 0;
7578 +
7579 +       lv->owner = NULL;
7580 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
7581 +       return 1;
7582 +}
7583 +
7584 +#define local_unlock_irqrestore(lvar, flags)                           \
7585 +       do {                                                            \
7586 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
7587 +                       put_local_var(lvar);                            \
7588 +       } while (0)
7589 +
7590 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
7591 +       do {                                                            \
7592 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
7593 +       } while (0)
7594 +
7595 +#define local_spin_trylock_irq(lvar, lock)                             \
7596 +       ({                                                              \
7597 +               int __locked;                                           \
7598 +               local_lock_irq(lvar);                                   \
7599 +               __locked = spin_trylock(lock);                          \
7600 +               if (!__locked)                                          \
7601 +                       local_unlock_irq(lvar);                         \
7602 +               __locked;                                               \
7603 +       })
7604 +
7605 +#define local_spin_lock_irq(lvar, lock)                                        \
7606 +       do {                                                            \
7607 +               local_lock_irq(lvar);                                   \
7608 +               spin_lock(lock);                                        \
7609 +       } while (0)
7610 +
7611 +#define local_spin_unlock_irq(lvar, lock)                              \
7612 +       do {                                                            \
7613 +               spin_unlock(lock);                                      \
7614 +               local_unlock_irq(lvar);                                 \
7615 +       } while (0)
7616 +
7617 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
7618 +       do {                                                            \
7619 +               local_lock_irqsave(lvar, flags);                        \
7620 +               spin_lock(lock);                                        \
7621 +       } while (0)
7622 +
7623 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
7624 +       do {                                                            \
7625 +               spin_unlock(lock);                                      \
7626 +               local_unlock_irqrestore(lvar, flags);                   \
7627 +       } while (0)
7628 +
7629 +#define get_locked_var(lvar, var)                                      \
7630 +       (*({                                                            \
7631 +               local_lock(lvar);                                       \
7632 +               this_cpu_ptr(&var);                                     \
7633 +       }))
7634 +
7635 +#define put_locked_var(lvar, var)      local_unlock(lvar);
7636 +
7637 +#define local_lock_cpu(lvar)                                           \
7638 +       ({                                                              \
7639 +               local_lock(lvar);                                       \
7640 +               smp_processor_id();                                     \
7641 +       })
7642 +
7643 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
7644 +
7645 +#else /* PREEMPT_RT_BASE */
7646 +
7647 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
7648 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
7649 +
7650 +static inline void local_irq_lock_init(int lvar) { }
7651 +
7652 +#define local_lock(lvar)                       preempt_disable()
7653 +#define local_unlock(lvar)                     preempt_enable()
7654 +#define local_lock_irq(lvar)                   local_irq_disable()
7655 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
7656 +#define local_unlock_irq(lvar)                 local_irq_enable()
7657 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
7658 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
7659 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
7660 +
7661 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
7662 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
7663 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
7664 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
7665 +       spin_lock_irqsave(lock, flags)
7666 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
7667 +       spin_unlock_irqrestore(lock, flags)
7668 +
7669 +#define get_locked_var(lvar, var)              get_cpu_var(var)
7670 +#define put_locked_var(lvar, var)              put_cpu_var(var)
7671 +
7672 +#define local_lock_cpu(lvar)                   get_cpu()
7673 +#define local_unlock_cpu(lvar)                 put_cpu()
7674 +
7675 +#endif
7676 +
7677 +#endif
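
A minimal usage sketch for the new local locks, assuming hypothetical per-CPU data my_counter protected by my_lock:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_counter);
static DEFINE_LOCAL_IRQ_LOCK(my_lock);

static void my_count(void)
{
	unsigned long flags;

	/*
	 * !RT: plain local_irq_save()/local_irq_restore().
	 * RT:  takes the per-CPU sleeping spinlock defined above, so the
	 *      section stays preemptible but is still serialized per CPU.
	 */
	local_lock_irqsave(my_lock, flags);
	this_cpu_inc(my_counter);
	local_unlock_irqrestore(my_lock, flags);
}
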
7678 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
7679 index 08d947fc4c59..705fb564a605 100644
7680 --- a/include/linux/mm_types.h
7681 +++ b/include/linux/mm_types.h
7682 @@ -11,6 +11,7 @@
7683  #include <linux/completion.h>
7684  #include <linux/cpumask.h>
7685  #include <linux/uprobes.h>
7686 +#include <linux/rcupdate.h>
7687  #include <linux/page-flags-layout.h>
7688  #include <linux/workqueue.h>
7689  #include <asm/page.h>
7690 @@ -509,6 +510,9 @@ struct mm_struct {
7691         bool tlb_flush_pending;
7692  #endif
7693         struct uprobes_state uprobes_state;
7694 +#ifdef CONFIG_PREEMPT_RT_BASE
7695 +       struct rcu_head delayed_drop;
7696 +#endif
7697  #ifdef CONFIG_X86_INTEL_MPX
7698         /* address of the bounds directory */
7699         void __user *bd_addr;
7700 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
7701 index 2cb7531e7d7a..b3fdfc820216 100644
7702 --- a/include/linux/mutex.h
7703 +++ b/include/linux/mutex.h
7704 @@ -19,6 +19,17 @@
7705  #include <asm/processor.h>
7706  #include <linux/osq_lock.h>
7707  
7708 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7709 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7710 +       , .dep_map = { .name = #lockname }
7711 +#else
7712 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7713 +#endif
7714 +
7715 +#ifdef CONFIG_PREEMPT_RT_FULL
7716 +# include <linux/mutex_rt.h>
7717 +#else
7718 +
7719  /*
7720   * Simple, straightforward mutexes with strict semantics:
7721   *
7722 @@ -99,13 +110,6 @@ do {                                                        \
7723  static inline void mutex_destroy(struct mutex *lock) {}
7724  #endif
7725  
7726 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
7727 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
7728 -               , .dep_map = { .name = #lockname }
7729 -#else
7730 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
7731 -#endif
7732 -
7733  #define __MUTEX_INITIALIZER(lockname) \
7734                 { .count = ATOMIC_INIT(1) \
7735                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
7736 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
7737  extern int mutex_trylock(struct mutex *lock);
7738  extern void mutex_unlock(struct mutex *lock);
7739  
7740 +#endif /* !PREEMPT_RT_FULL */
7741 +
7742  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
7743  
7744  #endif /* __LINUX_MUTEX_H */
7745 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
7746 new file mode 100644
7747 index 000000000000..c38a44b14da5
7748 --- /dev/null
7749 +++ b/include/linux/mutex_rt.h
7750 @@ -0,0 +1,84 @@
7751 +#ifndef __LINUX_MUTEX_RT_H
7752 +#define __LINUX_MUTEX_RT_H
7753 +
7754 +#ifndef __LINUX_MUTEX_H
7755 +#error "Please include mutex.h"
7756 +#endif
7757 +
7758 +#include <linux/rtmutex.h>
7759 +
7760 +/* FIXME: Just for __lockfunc */
7761 +#include <linux/spinlock.h>
7762 +
7763 +struct mutex {
7764 +       struct rt_mutex         lock;
7765 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7766 +       struct lockdep_map      dep_map;
7767 +#endif
7768 +};
7769 +
7770 +#define __MUTEX_INITIALIZER(mutexname)                                 \
7771 +       {                                                               \
7772 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
7773 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
7774 +       }
7775 +
7776 +#define DEFINE_MUTEX(mutexname)                                                \
7777 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
7778 +
7779 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
7780 +extern void __lockfunc _mutex_lock(struct mutex *lock);
7781 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
7782 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
7783 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
7784 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
7785 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
7786 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
7787 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
7788 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
7789 +
7790 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
7791 +#define mutex_lock(l)                  _mutex_lock(l)
7792 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
7793 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
7794 +#define mutex_trylock(l)               _mutex_trylock(l)
7795 +#define mutex_unlock(l)                        _mutex_unlock(l)
7796 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
7797 +
7798 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
7799 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
7800 +# define mutex_lock_interruptible_nested(l, s) \
7801 +                                       _mutex_lock_interruptible_nested(l, s)
7802 +# define mutex_lock_killable_nested(l, s) \
7803 +                                       _mutex_lock_killable_nested(l, s)
7804 +
7805 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
7806 +do {                                                                   \
7807 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
7808 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
7809 +} while (0)
7810 +
7811 +#else
7812 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
7813 +# define mutex_lock_interruptible_nested(l, s) \
7814 +                                       _mutex_lock_interruptible(l)
7815 +# define mutex_lock_killable_nested(l, s) \
7816 +                                       _mutex_lock_killable(l)
7817 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
7818 +#endif
7819 +
7820 +# define mutex_init(mutex)                             \
7821 +do {                                                   \
7822 +       static struct lock_class_key __key;             \
7823 +                                                       \
7824 +       rt_mutex_init(&(mutex)->lock);                  \
7825 +       __mutex_do_init((mutex), #mutex, &__key);       \
7826 +} while (0)
7827 +
7828 +# define __mutex_init(mutex, name, key)                        \
7829 +do {                                                   \
7830 +       rt_mutex_init(&(mutex)->lock);                  \
7831 +       __mutex_do_init((mutex), name, key);            \
7832 +} while (0)
7833 +
7834 +#endif
7835 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
7836 index d83590ef74a1..0ae3b6cf430c 100644
7837 --- a/include/linux/netdevice.h
7838 +++ b/include/linux/netdevice.h
7839 @@ -396,7 +396,19 @@ typedef enum rx_handler_result rx_handler_result_t;
7840  typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);
7841  
7842  void __napi_schedule(struct napi_struct *n);
7843 +
7844 +/*
7845 + * When PREEMPT_RT_FULL is defined, all device interrupt handlers
7846 + * run as threads and can themselves be preempted (without PREEMPT_RT,
7847 + * interrupt threads cannot be preempted). This means a call to
7848 + * __napi_schedule_irqoff() from an interrupt handler can be preempted,
7849 + * which can corrupt the napi->poll_list.
7850 + */
7851 +#ifdef CONFIG_PREEMPT_RT_FULL
7852 +#define __napi_schedule_irqoff(n) __napi_schedule(n)
7853 +#else
7854  void __napi_schedule_irqoff(struct napi_struct *n);
7855 +#endif
7856  
7857  static inline bool napi_disable_pending(struct napi_struct *n)
7858  {
7859 @@ -2461,14 +2473,53 @@ void netdev_freemem(struct net_device *dev);
7860  void synchronize_net(void);
7861  int init_dummy_netdev(struct net_device *dev);
7862  
7863 -DECLARE_PER_CPU(int, xmit_recursion);
7864  #define XMIT_RECURSION_LIMIT   10
7865 +#ifdef CONFIG_PREEMPT_RT_FULL
7866 +static inline int dev_recursion_level(void)
7867 +{
7868 +       return current->xmit_recursion;
7869 +}
7870 +
7871 +static inline int xmit_rec_read(void)
7872 +{
7873 +       return current->xmit_recursion;
7874 +}
7875 +
7876 +static inline void xmit_rec_inc(void)
7877 +{
7878 +       current->xmit_recursion++;
7879 +}
7880 +
7881 +static inline void xmit_rec_dec(void)
7882 +{
7883 +       current->xmit_recursion--;
7884 +}
7885 +
7886 +#else
7887 +
7888 +DECLARE_PER_CPU(int, xmit_recursion);
7889  
7890  static inline int dev_recursion_level(void)
7891  {
7892         return this_cpu_read(xmit_recursion);
7893  }
7894  
7895 +static inline int xmit_rec_read(void)
7896 +{
7897 +       return __this_cpu_read(xmit_recursion);
7898 +}
7899 +
7900 +static inline void xmit_rec_inc(void)
7901 +{
7902 +       __this_cpu_inc(xmit_recursion);
7903 +}
7904 +
7905 +static inline void xmit_rec_dec(void)
7906 +{
7907 +       __this_cpu_dec(xmit_recursion);
7908 +}
7909 +#endif
7910 +
7911  struct net_device *dev_get_by_index(struct net *net, int ifindex);
7912  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
7913  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
7914 @@ -2851,6 +2902,7 @@ struct softnet_data {
7915         unsigned int            dropped;
7916         struct sk_buff_head     input_pkt_queue;
7917         struct napi_struct      backlog;
7918 +       struct sk_buff_head     tofree_queue;
7919  
7920  };
7921  
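
A minimal sketch of the driver-side pattern that the __napi_schedule_irqoff() change above is about; struct my_priv and my_rx_irq() are hypothetical:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct my_priv {
	struct napi_struct napi;
};

static irqreturn_t my_rx_irq(int irq, void *dev_id)
{
	struct my_priv *priv = dev_id;

	/*
	 * napi_schedule_irqoff() ends up in __napi_schedule_irqoff(),
	 * which on PREEMPT_RT_FULL is simply __napi_schedule(): the
	 * threaded, preemptible handler cannot assume interrupts are
	 * hard-disabled here.
	 */
	napi_schedule_irqoff(&priv->napi);
	return IRQ_HANDLED;
}
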
7922 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
7923 index 2ad1a2b289b5..b4d10155af54 100644
7924 --- a/include/linux/netfilter/x_tables.h
7925 +++ b/include/linux/netfilter/x_tables.h
7926 @@ -4,6 +4,7 @@
7927  
7928  #include <linux/netdevice.h>
7929  #include <linux/static_key.h>
7930 +#include <linux/locallock.h>
7931  #include <uapi/linux/netfilter/x_tables.h>
7932  
7933  /* Test a struct->invflags and a boolean for inequality */
7934 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
7935   */
7936  DECLARE_PER_CPU(seqcount_t, xt_recseq);
7937  
7938 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
7939 +
7940  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
7941   *
7942   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
7943 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
7944  {
7945         unsigned int addend;
7946  
7947 +       /* RT protection */
7948 +       local_lock(xt_write_lock);
7949 +
7950         /*
7951          * Low order bit of sequence is set if we already
7952          * called xt_write_recseq_begin().
7953 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
7954         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
7955         smp_wmb();
7956         __this_cpu_add(xt_recseq.sequence, addend);
7957 +       local_unlock(xt_write_lock);
7958  }
7959  
7960  /*
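
For reference, a sketch of the caller pattern around xt_write_recseq_begin()/end() (as used by the *tables packet walkers), which now also takes and drops xt_write_lock on RT; my_table_walk() is hypothetical:

#include <linux/netfilter/x_tables.h>

static void my_table_walk(void)
{
	unsigned int addend;

	local_bh_disable();
	addend = xt_write_recseq_begin();	/* locks xt_write_lock on RT */
	/* ... walk the ruleset ... */
	xt_write_recseq_end(addend);		/* unlocks xt_write_lock on RT */
	local_bh_enable();
}
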
7961 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
7962 index 810124b33327..d54ca43d571f 100644
7963 --- a/include/linux/nfs_fs.h
7964 +++ b/include/linux/nfs_fs.h
7965 @@ -165,7 +165,11 @@ struct nfs_inode {
7966  
7967         /* Readers: in-flight sillydelete RPC calls */
7968         /* Writers: rmdir */
7969 +#ifdef CONFIG_PREEMPT_RT_BASE
7970 +       struct semaphore        rmdir_sem;
7971 +#else
7972         struct rw_semaphore     rmdir_sem;
7973 +#endif
7974  
7975  #if IS_ENABLED(CONFIG_NFS_V4)
7976         struct nfs4_cached_acl  *nfs4_acl;
7977 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
7978 index beb1e10f446e..ebaf2e7bfe29 100644
7979 --- a/include/linux/nfs_xdr.h
7980 +++ b/include/linux/nfs_xdr.h
7981 @@ -1490,7 +1490,7 @@ struct nfs_unlinkdata {
7982         struct nfs_removeargs args;
7983         struct nfs_removeres res;
7984         struct dentry *dentry;
7985 -       wait_queue_head_t wq;
7986 +       struct swait_queue_head wq;
7987         struct rpc_cred *cred;
7988         struct nfs_fattr dir_attr;
7989         long timeout;
7990 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
7991 index 4149868de4e6..babe5b9bcb91 100644
7992 --- a/include/linux/notifier.h
7993 +++ b/include/linux/notifier.h
7994 @@ -6,7 +6,7 @@
7995   *
7996   *                             Alan Cox <Alan.Cox@linux.org>
7997   */
7998 -
7999 +
8000  #ifndef _LINUX_NOTIFIER_H
8001  #define _LINUX_NOTIFIER_H
8002  #include <linux/errno.h>
8003 @@ -42,9 +42,7 @@
8004   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
8005   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
8006   * SRCU notifier chains should be used when the chain will be called very
8007 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
8008 - * chains are slightly more difficult to use because they require special
8009 - * runtime initialization.
8010 + * often but notifier_blocks will seldom be removed.
8011   */
8012  
8013  struct notifier_block;
8014 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
8015                 (name)->head = NULL;            \
8016         } while (0)
8017  
8018 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
8019 +/* srcu_notifier_heads must be cleaned up dynamically */
8020  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8021  #define srcu_cleanup_notifier_head(name)       \
8022                 cleanup_srcu_struct(&(name)->srcu);
8023 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8024                 .head = NULL }
8025  #define RAW_NOTIFIER_INIT(name)        {                               \
8026                 .head = NULL }
8027 -/* srcu_notifier_heads cannot be initialized statically */
8028 +
8029 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
8030 +       {                                                       \
8031 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
8032 +               .head = NULL,                                   \
8033 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
8034 +       }
8035  
8036  #define ATOMIC_NOTIFIER_HEAD(name)                             \
8037         struct atomic_notifier_head name =                      \
8038 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
8039         struct raw_notifier_head name =                         \
8040                 RAW_NOTIFIER_INIT(name)
8041  
8042 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
8043 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
8044 +                       name##_head_srcu_array);                \
8045 +       mod struct srcu_notifier_head name =                    \
8046 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
8047 +
8048 +#define SRCU_NOTIFIER_HEAD(name)                               \
8049 +       _SRCU_NOTIFIER_HEAD(name, )
8050 +
8051 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
8052 +       _SRCU_NOTIFIER_HEAD(name, static)
8053 +
8054  #ifdef __KERNEL__
8055  
8056  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
8057 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
8058  
8059  /*
8060   *     Declared notifiers so far. I can imagine quite a few more chains
8061 - *     over time (eg laptop power reset chains, reboot chain (to clean 
8062 + *     over time (eg laptop power reset chains, reboot chain (to clean
8063   *     device units up), device [un]mount chain, module load/unload chain,
8064 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
8065 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
8066   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
8067   */
8068 -
8069 +
8070  /* CPU notfiers are defined in include/linux/cpu.h. */
8071  
8072  /* netdevice notifiers are defined in include/linux/netdevice.h */
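
With the initializers added above an SRCU notifier chain can be defined statically, with no runtime srcu_init_notifier_head() call. A minimal sketch; my_chain, my_event_cb, my_nb and my_init() are hypothetical:

#include <linux/init.h>
#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD_STATIC(my_chain);	/* static definition, per the macros above */

static int my_event_cb(struct notifier_block *nb, unsigned long event, void *data)
{
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_event_cb,
};

static int __init my_init(void)
{
	srcu_notifier_chain_register(&my_chain, &my_nb);
	srcu_notifier_call_chain(&my_chain, 0, NULL);
	return 0;
}
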
8073 diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
8074 index 5b2e6159b744..ea940f451606 100644
8075 --- a/include/linux/percpu-rwsem.h
8076 +++ b/include/linux/percpu-rwsem.h
8077 @@ -4,7 +4,7 @@
8078  #include <linux/atomic.h>
8079  #include <linux/rwsem.h>
8080  #include <linux/percpu.h>
8081 -#include <linux/wait.h>
8082 +#include <linux/swait.h>
8083  #include <linux/rcu_sync.h>
8084  #include <linux/lockdep.h>
8085  
8086 @@ -12,7 +12,7 @@ struct percpu_rw_semaphore {
8087         struct rcu_sync         rss;
8088         unsigned int __percpu   *read_count;
8089         struct rw_semaphore     rw_sem;
8090 -       wait_queue_head_t       writer;
8091 +       struct swait_queue_head writer;
8092         int                     readers_block;
8093  };
8094  
8095 @@ -22,13 +22,13 @@ static struct percpu_rw_semaphore name = {                          \
8096         .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),        \
8097         .read_count = &__percpu_rwsem_rc_##name,                        \
8098         .rw_sem = __RWSEM_INITIALIZER(name.rw_sem),                     \
8099 -       .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),           \
8100 +       .writer = __SWAIT_QUEUE_HEAD_INITIALIZER(name.writer),          \
8101  }
8102  
8103  extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
8104  extern void __percpu_up_read(struct percpu_rw_semaphore *);
8105  
8106 -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem)
8107 +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8108  {
8109         might_sleep();
8110  
8111 @@ -46,16 +46,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *
8112         __this_cpu_inc(*sem->read_count);
8113         if (unlikely(!rcu_sync_is_idle(&sem->rss)))
8114                 __percpu_down_read(sem, false); /* Unconditional memory barrier */
8115 -       barrier();
8116         /*
8117 -        * The barrier() prevents the compiler from
8118 +        * The preempt_enable() prevents the compiler from
8119          * bleeding the critical section out.
8120          */
8121 -}
8122 -
8123 -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
8124 -{
8125 -       percpu_down_read_preempt_disable(sem);
8126         preempt_enable();
8127  }
8128  
8129 @@ -82,13 +76,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
8130         return ret;
8131  }
8132  
8133 -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem)
8134 +static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8135  {
8136 -       /*
8137 -        * The barrier() prevents the compiler from
8138 -        * bleeding the critical section out.
8139 -        */
8140 -       barrier();
8141 +       preempt_disable();
8142         /*
8143          * Same as in percpu_down_read().
8144          */
8145 @@ -101,12 +91,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem
8146         rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
8147  }
8148  
8149 -static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
8150 -{
8151 -       preempt_disable();
8152 -       percpu_up_read_preempt_enable(sem);
8153 -}
8154 -
8155  extern void percpu_down_write(struct percpu_rw_semaphore *);
8156  extern void percpu_up_write(struct percpu_rw_semaphore *);
8157  
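
A minimal reader-side sketch after the change above (the *_preempt_disable()/_preempt_enable() read-side variants are gone); my_sem and my_reader() are hypothetical:

#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(my_sem);

static void my_reader(void)
{
	percpu_down_read(&my_sem);	/* may sleep; writer wakeups use an swait queue now */
	/* ... read-side critical section ... */
	percpu_up_read(&my_sem);
}
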
8158 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
8159 index 56939d3f6e53..1c7e33fc83e4 100644
8160 --- a/include/linux/percpu.h
8161 +++ b/include/linux/percpu.h
8162 @@ -18,6 +18,35 @@
8163  #define PERCPU_MODULE_RESERVE          0
8164  #endif
8165  
8166 +#ifdef CONFIG_PREEMPT_RT_FULL
8167 +
8168 +#define get_local_var(var) (*({        \
8169 +       migrate_disable();      \
8170 +       this_cpu_ptr(&var);     }))
8171 +
8172 +#define put_local_var(var) do {        \
8173 +       (void)&(var);           \
8174 +       migrate_enable();       \
8175 +} while (0)
8176 +
8177 +# define get_local_ptr(var) ({ \
8178 +       migrate_disable();      \
8179 +       this_cpu_ptr(var);      })
8180 +
8181 +# define put_local_ptr(var) do {       \
8182 +       (void)(var);                    \
8183 +       migrate_enable();               \
8184 +} while (0)
8185 +
8186 +#else
8187 +
8188 +#define get_local_var(var)     get_cpu_var(var)
8189 +#define put_local_var(var)     put_cpu_var(var)
8190 +#define get_local_ptr(var)     get_cpu_ptr(var)
8191 +#define put_local_ptr(var)     put_cpu_ptr(var)
8192 +
8193 +#endif
8194 +
8195  /* minimum unit size, also is the maximum supported allocation size */
8196  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
8197  
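
A minimal sketch of the get_local_var()/put_local_var() pair defined above, with a hypothetical per-CPU counter my_stat:

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_stat);

static void my_account(void)
{
	unsigned long *stat;

	/*
	 * !RT: get_cpu_var()/put_cpu_var(), i.e. preemption disabled.
	 * RT:  only migration is disabled; the section may be preempted
	 *      but keeps operating on this CPU's copy.
	 */
	stat = &get_local_var(my_stat);
	(*stat)++;
	put_local_var(my_stat);
}
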
8198 diff --git a/include/linux/pid.h b/include/linux/pid.h
8199 index 23705a53abba..2cc64b779f03 100644
8200 --- a/include/linux/pid.h
8201 +++ b/include/linux/pid.h
8202 @@ -2,6 +2,7 @@
8203  #define _LINUX_PID_H
8204  
8205  #include <linux/rcupdate.h>
8206 +#include <linux/atomic.h>
8207  
8208  enum pid_type
8209  {
8210 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
8211 index 75e4e30677f1..1cfb1cb72354 100644
8212 --- a/include/linux/preempt.h
8213 +++ b/include/linux/preempt.h
8214 @@ -50,7 +50,11 @@
8215  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
8216  #define NMI_OFFSET     (1UL << NMI_SHIFT)
8217  
8218 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
8219 +#ifndef CONFIG_PREEMPT_RT_FULL
8220 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
8221 +#else
8222 +# define SOFTIRQ_DISABLE_OFFSET                (0)
8223 +#endif
8224  
8225  /* We use the MSB mostly because its available */
8226  #define PREEMPT_NEED_RESCHED   0x80000000
8227 @@ -59,9 +63,15 @@
8228  #include <asm/preempt.h>
8229  
8230  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
8231 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
8232  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
8233                                  | NMI_MASK))
8234 +#ifndef CONFIG_PREEMPT_RT_FULL
8235 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
8236 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
8237 +#else
8238 +# define softirq_count()       (0UL)
8239 +extern int in_serving_softirq(void);
8240 +#endif
8241  
8242  /*
8243   * Are we doing bottom half or hardware interrupt processing?
8244 @@ -72,7 +82,6 @@
8245  #define in_irq()               (hardirq_count())
8246  #define in_softirq()           (softirq_count())
8247  #define in_interrupt()         (irq_count())
8248 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
8249  
8250  /*
8251   * Are we in NMI context?
8252 @@ -91,7 +100,11 @@
8253  /*
8254   * The preempt_count offset after spin_lock()
8255   */
8256 +#if !defined(CONFIG_PREEMPT_RT_FULL)
8257  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
8258 +#else
8259 +#define PREEMPT_LOCK_OFFSET    0
8260 +#endif
8261  
8262  /*
8263   * The preempt_count offset needed for things like:
8264 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
8265  #define preempt_count_inc() preempt_count_add(1)
8266  #define preempt_count_dec() preempt_count_sub(1)
8267  
8268 +#ifdef CONFIG_PREEMPT_LAZY
8269 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
8270 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
8271 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
8272 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
8273 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
8274 +#else
8275 +#define add_preempt_lazy_count(val)    do { } while (0)
8276 +#define sub_preempt_lazy_count(val)    do { } while (0)
8277 +#define inc_preempt_lazy_count()       do { } while (0)
8278 +#define dec_preempt_lazy_count()       do { } while (0)
8279 +#define preempt_lazy_count()           (0)
8280 +#endif
8281 +
8282  #ifdef CONFIG_PREEMPT_COUNT
8283  
8284  #define preempt_disable() \
8285 @@ -148,13 +175,25 @@ do { \
8286         barrier(); \
8287  } while (0)
8288  
8289 +#define preempt_lazy_disable() \
8290 +do { \
8291 +       inc_preempt_lazy_count(); \
8292 +       barrier(); \
8293 +} while (0)
8294 +
8295  #define sched_preempt_enable_no_resched() \
8296  do { \
8297         barrier(); \
8298         preempt_count_dec(); \
8299  } while (0)
8300  
8301 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8302 +#ifdef CONFIG_PREEMPT_RT_BASE
8303 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
8304 +# define preempt_check_resched_rt() preempt_check_resched()
8305 +#else
8306 +# define preempt_enable_no_resched() preempt_enable()
8307 +# define preempt_check_resched_rt() barrier();
8308 +#endif
8309  
8310  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
8311  
8312 @@ -179,6 +218,13 @@ do { \
8313                 __preempt_schedule(); \
8314  } while (0)
8315  
8316 +#define preempt_lazy_enable() \
8317 +do { \
8318 +       dec_preempt_lazy_count(); \
8319 +       barrier(); \
8320 +       preempt_check_resched(); \
8321 +} while (0)
8322 +
8323  #else /* !CONFIG_PREEMPT */
8324  #define preempt_enable() \
8325  do { \
8326 @@ -224,6 +270,7 @@ do { \
8327  #define preempt_disable_notrace()              barrier()
8328  #define preempt_enable_no_resched_notrace()    barrier()
8329  #define preempt_enable_notrace()               barrier()
8330 +#define preempt_check_resched_rt()             barrier()
8331  #define preemptible()                          0
8332  
8333  #endif /* CONFIG_PREEMPT_COUNT */
8334 @@ -244,10 +291,31 @@ do { \
8335  } while (0)
8336  #define preempt_fold_need_resched() \
8337  do { \
8338 -       if (tif_need_resched()) \
8339 +       if (tif_need_resched_now()) \
8340                 set_preempt_need_resched(); \
8341  } while (0)
8342  
8343 +#ifdef CONFIG_PREEMPT_RT_FULL
8344 +# define preempt_disable_rt()          preempt_disable()
8345 +# define preempt_enable_rt()           preempt_enable()
8346 +# define preempt_disable_nort()                barrier()
8347 +# define preempt_enable_nort()         barrier()
8348 +# ifdef CONFIG_SMP
8349 +   extern void migrate_disable(void);
8350 +   extern void migrate_enable(void);
8351 +# else /* CONFIG_SMP */
8352 +#  define migrate_disable()            barrier()
8353 +#  define migrate_enable()             barrier()
8354 +# endif /* CONFIG_SMP */
8355 +#else
8356 +# define preempt_disable_rt()          barrier()
8357 +# define preempt_enable_rt()           barrier()
8358 +# define preempt_disable_nort()                preempt_disable()
8359 +# define preempt_enable_nort()         preempt_enable()
8360 +# define migrate_disable()             preempt_disable()
8361 +# define migrate_enable()              preempt_enable()
8362 +#endif
8363 +
8364  #ifdef CONFIG_PREEMPT_NOTIFIERS
8365  
8366  struct preempt_notifier;
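Not part of the patch: a hedged sketch of how the new preempt.h helpers pair up, using a hypothetical per-CPU counter my_stat. migrate_disable()/migrate_enable() keep the task on its CPU while leaving it preemptible on RT; preempt_disable_nort()/preempt_enable_nort() compile to a real preempt_disable()/preempt_enable() only on non-RT kernels and to barrier() on RT.

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, my_stat);

static void my_bump_stat(void)
{
        migrate_disable();              /* stay on this CPU; still preemptible on RT */
        this_cpu_inc(my_stat);
        migrate_enable();
}

static void my_nort_section(void)
{
        preempt_disable_nort();         /* real preempt_disable() only on !RT */
        /* ... state that an RT kernel protects with a sleeping lock instead ... */
        preempt_enable_nort();
}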
8367 diff --git a/include/linux/printk.h b/include/linux/printk.h
8368 index eac1af8502bb..37e647af0b0b 100644
8369 --- a/include/linux/printk.h
8370 +++ b/include/linux/printk.h
8371 @@ -126,9 +126,11 @@ struct va_format {
8372  #ifdef CONFIG_EARLY_PRINTK
8373  extern asmlinkage __printf(1, 2)
8374  void early_printk(const char *fmt, ...);
8375 +extern void printk_kill(void);
8376  #else
8377  static inline __printf(1, 2) __cold
8378  void early_printk(const char *s, ...) { }
8379 +static inline void printk_kill(void) { }
8380  #endif
8381  
8382  #ifdef CONFIG_PRINTK_NMI
8383 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
8384 index af3581b8a451..277295039c8f 100644
8385 --- a/include/linux/radix-tree.h
8386 +++ b/include/linux/radix-tree.h
8387 @@ -292,6 +292,8 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
8388  int radix_tree_preload(gfp_t gfp_mask);
8389  int radix_tree_maybe_preload(gfp_t gfp_mask);
8390  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
8391 +void radix_tree_preload_end(void);
8392 +
8393  void radix_tree_init(void);
8394  void *radix_tree_tag_set(struct radix_tree_root *root,
8395                         unsigned long index, unsigned int tag);
8396 @@ -314,11 +316,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
8397  int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
8398  unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
8399  
8400 -static inline void radix_tree_preload_end(void)
8401 -{
8402 -       preempt_enable();
8403 -}
8404 -
8405  /**
8406   * struct radix_tree_iter - radix tree iterator state
8407   *
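Not part of the patch: the usual preload pattern is unchanged by making radix_tree_preload_end() out of line above. The names my_lock and my_store are hypothetical, and the out-of-line body is assumed to release whatever per-CPU protection the RT tree code takes.

#include <linux/gfp.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);

static int my_store(struct radix_tree_root *root, unsigned long index, void *item)
{
        int err;

        err = radix_tree_preload(GFP_KERNEL);   /* may sleep, preloads per-CPU nodes */
        if (err)
                return err;

        spin_lock(&my_lock);
        err = radix_tree_insert(root, index, item);
        spin_unlock(&my_lock);

        radix_tree_preload_end();               /* now a function, declared above */
        return err;
}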
8408 diff --git a/include/linux/random.h b/include/linux/random.h
8409 index 7bd2403e4fef..b2df7148a42b 100644
8410 --- a/include/linux/random.h
8411 +++ b/include/linux/random.h
8412 @@ -31,7 +31,7 @@ static inline void add_latent_entropy(void) {}
8413  
8414  extern void add_input_randomness(unsigned int type, unsigned int code,
8415                                  unsigned int value) __latent_entropy;
8416 -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy;
8417 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy;
8418  
8419  extern void get_random_bytes(void *buf, int nbytes);
8420  extern int add_random_ready_callback(struct random_ready_callback *rdy);
8421 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
8422 index e585018498d5..25c64474fc27 100644
8423 --- a/include/linux/rbtree.h
8424 +++ b/include/linux/rbtree.h
8425 @@ -31,7 +31,7 @@
8426  
8427  #include <linux/kernel.h>
8428  #include <linux/stddef.h>
8429 -#include <linux/rcupdate.h>
8430 +#include <linux/rcu_assign_pointer.h>
8431  
8432  struct rb_node {
8433         unsigned long  __rb_parent_color;
8434 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
8435 index d076183e49be..36bfb4dd57ae 100644
8436 --- a/include/linux/rbtree_augmented.h
8437 +++ b/include/linux/rbtree_augmented.h
8438 @@ -26,6 +26,7 @@
8439  
8440  #include <linux/compiler.h>
8441  #include <linux/rbtree.h>
8442 +#include <linux/rcupdate.h>
8443  
8444  /*
8445   * Please note - only struct rb_augment_callbacks and the prototypes for
8446 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
8447 new file mode 100644
8448 index 000000000000..7066962a4379
8449 --- /dev/null
8450 +++ b/include/linux/rcu_assign_pointer.h
8451 @@ -0,0 +1,54 @@
8452 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
8453 +#define __LINUX_RCU_ASSIGN_POINTER_H__
8454 +#include <linux/compiler.h>
8455 +#include <asm/barrier.h>
8456 +
8457 +/**
8458 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8459 + * @v: The value to statically initialize with.
8460 + */
8461 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8462 +
8463 +/**
8464 + * rcu_assign_pointer() - assign to RCU-protected pointer
8465 + * @p: pointer to assign to
8466 + * @v: value to assign (publish)
8467 + *
8468 + * Assigns the specified value to the specified RCU-protected
8469 + * pointer, ensuring that any concurrent RCU readers will see
8470 + * any prior initialization.
8471 + *
8472 + * Inserts memory barriers on architectures that require them
8473 + * (which is most of them), and also prevents the compiler from
8474 + * reordering the code that initializes the structure after the pointer
8475 + * assignment.  More importantly, this call documents which pointers
8476 + * will be dereferenced by RCU read-side code.
8477 + *
8478 + * In some special cases, you may use RCU_INIT_POINTER() instead
8479 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8480 + * to the fact that it does not constrain either the CPU or the compiler.
8481 + * That said, using RCU_INIT_POINTER() when you should have used
8482 + * rcu_assign_pointer() is a very bad thing that results in
8483 + * impossible-to-diagnose memory corruption.  So please be careful.
8484 + * See the RCU_INIT_POINTER() comment header for details.
8485 + *
8486 + * Note that rcu_assign_pointer() evaluates each of its arguments only
8487 + * once, appearances notwithstanding.  One of the "extra" evaluations
8488 + * is in typeof() and the other visible only to sparse (__CHECKER__),
8489 + * neither of which actually execute the argument.  As with most cpp
8490 + * macros, this execute-arguments-only-once property is important, so
8491 + * please be careful when making changes to rcu_assign_pointer() and the
8492 + * other macros that it invokes.
8493 + */
8494 +#define rcu_assign_pointer(p, v)                                             \
8495 +({                                                                           \
8496 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8497 +                                                                             \
8498 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8499 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8500 +       else                                                                  \
8501 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8502 +       _r_a_p__v;                                                            \
8503 +})
8504 +
8505 +#endif
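Not part of the patch: a short publish sketch of the pattern the comment above describes. struct my_conf and my_conf_ptr are hypothetical names. The structure is fully initialized before rcu_assign_pointer() publishes it, so concurrent readers never see a half-built object.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_conf {
        int threshold;
};

static struct my_conf __rcu *my_conf_ptr;

static int my_publish(int threshold)
{
        struct my_conf *c = kmalloc(sizeof(*c), GFP_KERNEL);

        if (!c)
                return -ENOMEM;
        c->threshold = threshold;               /* initialize first ...              */
        rcu_assign_pointer(my_conf_ptr, c);     /* ... then publish with the barrier */
        return 0;
}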
8506 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
8507 index 01f71e1d2e94..30cc001d0d5a 100644
8508 --- a/include/linux/rcupdate.h
8509 +++ b/include/linux/rcupdate.h
8510 @@ -46,6 +46,7 @@
8511  #include <linux/compiler.h>
8512  #include <linux/ktime.h>
8513  #include <linux/irqflags.h>
8514 +#include <linux/rcu_assign_pointer.h>
8515  
8516  #include <asm/barrier.h>
8517  
8518 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
8519  
8520  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8521  
8522 +#ifdef CONFIG_PREEMPT_RT_FULL
8523 +#define call_rcu_bh    call_rcu
8524 +#else
8525  /**
8526   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
8527   * @head: structure to be used for queueing the RCU updates.
8528 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
8529   */
8530  void call_rcu_bh(struct rcu_head *head,
8531                  rcu_callback_t func);
8532 +#endif
8533  
8534  /**
8535   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
8536 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
8537   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
8538   */
8539  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
8540 +#ifndef CONFIG_PREEMPT_RT_FULL
8541 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8542 +#else
8543 +static inline int sched_rcu_preempt_depth(void) { return 0; }
8544 +#endif
8545  
8546  #else /* #ifdef CONFIG_PREEMPT_RCU */
8547  
8548 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
8549         return 0;
8550  }
8551  
8552 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
8553 +
8554  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
8555  
8556  /* Internal to kernel */
8557 @@ -505,7 +517,14 @@ extern struct lockdep_map rcu_callback_map;
8558  int debug_lockdep_rcu_enabled(void);
8559  
8560  int rcu_read_lock_held(void);
8561 +#ifdef CONFIG_PREEMPT_RT_FULL
8562 +static inline int rcu_read_lock_bh_held(void)
8563 +{
8564 +       return rcu_read_lock_held();
8565 +}
8566 +#else
8567  int rcu_read_lock_bh_held(void);
8568 +#endif
8569  
8570  /**
8571   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
8572 @@ -626,54 +645,6 @@ static inline void rcu_preempt_sleep_check(void)
8573  })
8574  
8575  /**
8576 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
8577 - * @v: The value to statically initialize with.
8578 - */
8579 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
8580 -
8581 -/**
8582 - * rcu_assign_pointer() - assign to RCU-protected pointer
8583 - * @p: pointer to assign to
8584 - * @v: value to assign (publish)
8585 - *
8586 - * Assigns the specified value to the specified RCU-protected
8587 - * pointer, ensuring that any concurrent RCU readers will see
8588 - * any prior initialization.
8589 - *
8590 - * Inserts memory barriers on architectures that require them
8591 - * (which is most of them), and also prevents the compiler from
8592 - * reordering the code that initializes the structure after the pointer
8593 - * assignment.  More importantly, this call documents which pointers
8594 - * will be dereferenced by RCU read-side code.
8595 - *
8596 - * In some special cases, you may use RCU_INIT_POINTER() instead
8597 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
8598 - * to the fact that it does not constrain either the CPU or the compiler.
8599 - * That said, using RCU_INIT_POINTER() when you should have used
8600 - * rcu_assign_pointer() is a very bad thing that results in
8601 - * impossible-to-diagnose memory corruption.  So please be careful.
8602 - * See the RCU_INIT_POINTER() comment header for details.
8603 - *
8604 - * Note that rcu_assign_pointer() evaluates each of its arguments only
8605 - * once, appearances notwithstanding.  One of the "extra" evaluations
8606 - * is in typeof() and the other visible only to sparse (__CHECKER__),
8607 - * neither of which actually execute the argument.  As with most cpp
8608 - * macros, this execute-arguments-only-once property is important, so
8609 - * please be careful when making changes to rcu_assign_pointer() and the
8610 - * other macros that it invokes.
8611 - */
8612 -#define rcu_assign_pointer(p, v)                                             \
8613 -({                                                                           \
8614 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
8615 -                                                                             \
8616 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
8617 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
8618 -       else                                                                  \
8619 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
8620 -       _r_a_p__v;                                                            \
8621 -})
8622 -
8623 -/**
8624   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
8625   * @p: The pointer to read
8626   *
8627 @@ -951,10 +922,14 @@ static inline void rcu_read_unlock(void)
8628  static inline void rcu_read_lock_bh(void)
8629  {
8630         local_bh_disable();
8631 +#ifdef CONFIG_PREEMPT_RT_FULL
8632 +       rcu_read_lock();
8633 +#else
8634         __acquire(RCU_BH);
8635         rcu_lock_acquire(&rcu_bh_lock_map);
8636         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8637                          "rcu_read_lock_bh() used illegally while idle");
8638 +#endif
8639  }
8640  
8641  /*
8642 @@ -964,10 +939,14 @@ static inline void rcu_read_lock_bh(void)
8643   */
8644  static inline void rcu_read_unlock_bh(void)
8645  {
8646 +#ifdef CONFIG_PREEMPT_RT_FULL
8647 +       rcu_read_unlock();
8648 +#else
8649         RCU_LOCKDEP_WARN(!rcu_is_watching(),
8650                          "rcu_read_unlock_bh() used illegally while idle");
8651         rcu_lock_release(&rcu_bh_lock_map);
8652         __release(RCU_BH);
8653 +#endif
8654         local_bh_enable();
8655  }
8656  
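Not part of the patch: a reader-side sketch matching the change above, with hypothetical names. On PREEMPT_RT_FULL rcu_read_lock_bh() becomes local_bh_disable() plus a plain rcu_read_lock(), so the same reader works on both configurations.

#include <linux/rcupdate.h>

struct my_cfg {
        int threshold;
};

static struct my_cfg __rcu *my_cfg_ptr;

static int my_read_threshold(void)
{
        struct my_cfg *c;
        int val = 0;

        rcu_read_lock_bh();                     /* rcu_read_lock() + BH off on RT */
        c = rcu_dereference_bh(my_cfg_ptr);
        if (c)
                val = c->threshold;
        rcu_read_unlock_bh();
        return val;
}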
8657 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
8658 index 63a4e4cf40a5..08ab12df2863 100644
8659 --- a/include/linux/rcutree.h
8660 +++ b/include/linux/rcutree.h
8661 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
8662         rcu_note_context_switch();
8663  }
8664  
8665 +#ifdef CONFIG_PREEMPT_RT_FULL
8666 +# define synchronize_rcu_bh    synchronize_rcu
8667 +#else
8668  void synchronize_rcu_bh(void);
8669 +#endif
8670  void synchronize_sched_expedited(void);
8671  void synchronize_rcu_expedited(void);
8672  
8673 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
8674  }
8675  
8676  void rcu_barrier(void);
8677 +#ifdef CONFIG_PREEMPT_RT_FULL
8678 +# define rcu_barrier_bh                rcu_barrier
8679 +#else
8680  void rcu_barrier_bh(void);
8681 +#endif
8682  void rcu_barrier_sched(void);
8683  unsigned long get_state_synchronize_rcu(void);
8684  void cond_synchronize_rcu(unsigned long oldstate);
8685 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
8686  extern unsigned long rcutorture_testseq;
8687  extern unsigned long rcutorture_vernum;
8688  unsigned long rcu_batches_started(void);
8689 -unsigned long rcu_batches_started_bh(void);
8690  unsigned long rcu_batches_started_sched(void);
8691  unsigned long rcu_batches_completed(void);
8692 -unsigned long rcu_batches_completed_bh(void);
8693  unsigned long rcu_batches_completed_sched(void);
8694  unsigned long rcu_exp_batches_completed(void);
8695  unsigned long rcu_exp_batches_completed_sched(void);
8696  void show_rcu_gp_kthreads(void);
8697  
8698  void rcu_force_quiescent_state(void);
8699 -void rcu_bh_force_quiescent_state(void);
8700  void rcu_sched_force_quiescent_state(void);
8701  
8702  void rcu_idle_enter(void);
8703 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
8704  
8705  bool rcu_is_watching(void);
8706  
8707 +#ifndef CONFIG_PREEMPT_RT_FULL
8708 +void rcu_bh_force_quiescent_state(void);
8709 +unsigned long rcu_batches_started_bh(void);
8710 +unsigned long rcu_batches_completed_bh(void);
8711 +#else
8712 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
8713 +# define rcu_batches_completed_bh      rcu_batches_completed
8714 +# define rcu_batches_started_bh                rcu_batches_completed
8715 +#endif
8716 +
8717  void rcu_all_qs(void);
8718  
8719  /* RCUtree hotplug events */
8720 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
8721 index 1abba5ce2a2f..30211c627511 100644
8722 --- a/include/linux/rtmutex.h
8723 +++ b/include/linux/rtmutex.h
8724 @@ -13,11 +13,15 @@
8725  #define __LINUX_RT_MUTEX_H
8726  
8727  #include <linux/linkage.h>
8728 +#include <linux/spinlock_types_raw.h>
8729  #include <linux/rbtree.h>
8730 -#include <linux/spinlock_types.h>
8731  
8732  extern int max_lock_depth; /* for sysctl */
8733  
8734 +#ifdef CONFIG_DEBUG_MUTEXES
8735 +#include <linux/debug_locks.h>
8736 +#endif
8737 +
8738  /**
8739   * The rt_mutex structure
8740   *
8741 @@ -31,8 +35,8 @@ struct rt_mutex {
8742         struct rb_root          waiters;
8743         struct rb_node          *waiters_leftmost;
8744         struct task_struct      *owner;
8745 -#ifdef CONFIG_DEBUG_RT_MUTEXES
8746         int                     save_state;
8747 +#ifdef CONFIG_DEBUG_RT_MUTEXES
8748         const char              *name, *file;
8749         int                     line;
8750         void                    *magic;
8751 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
8752  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
8753  #endif
8754  
8755 +# define rt_mutex_init(mutex)                                  \
8756 +       do {                                                    \
8757 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
8758 +               __rt_mutex_init(mutex, #mutex);                 \
8759 +       } while (0)
8760 +
8761  #ifdef CONFIG_DEBUG_RT_MUTEXES
8762  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
8763         , .name = #mutexname, .file = __FILE__, .line = __LINE__
8764 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
8765   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
8766  #else
8767  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8768 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
8769  # define rt_mutex_debug_task_free(t)                   do { } while (0)
8770  #endif
8771  
8772 -#define __RT_MUTEX_INITIALIZER(mutexname) \
8773 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8774 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
8775 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
8776         , .waiters = RB_ROOT \
8777         , .owner = NULL \
8778 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
8779 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
8780 +
8781 +#define __RT_MUTEX_INITIALIZER(mutexname) \
8782 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
8783 +
8784 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
8785 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
8786 +       , .save_state = 1 }
8787  
8788  #define DEFINE_RT_MUTEX(mutexname) \
8789         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
8790 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
8791  
8792  extern void rt_mutex_lock(struct rt_mutex *lock);
8793  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
8794 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
8795  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
8796                                struct hrtimer_sleeper *timeout);
8797  
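Not part of the patch: a sketch of the newly declared rt_mutex_lock_killable(), with hypothetical names my_rtm and my_do_work. The lock attempt can be aborted by a fatal signal instead of blocking indefinitely.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(my_rtm);

static int my_do_work(void)
{
        int ret;

        ret = rt_mutex_lock_killable(&my_rtm);
        if (ret)
                return ret;             /* fatal signal arrived while blocked */
        /* ... critical section ... */
        rt_mutex_unlock(&my_rtm);
        return 0;
}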
8798 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
8799 new file mode 100644
8800 index 000000000000..49ed2d45d3be
8801 --- /dev/null
8802 +++ b/include/linux/rwlock_rt.h
8803 @@ -0,0 +1,99 @@
8804 +#ifndef __LINUX_RWLOCK_RT_H
8805 +#define __LINUX_RWLOCK_RT_H
8806 +
8807 +#ifndef __LINUX_SPINLOCK_H
8808 +#error Do not include directly. Use spinlock.h
8809 +#endif
8810 +
8811 +#define rwlock_init(rwl)                               \
8812 +do {                                                   \
8813 +       static struct lock_class_key __key;             \
8814 +                                                       \
8815 +       rt_mutex_init(&(rwl)->lock);                    \
8816 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
8817 +} while (0)
8818 +
8819 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
8820 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
8821 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
8822 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
8823 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
8824 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
8825 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
8826 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
8827 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
8828 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
8829 +
8830 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
8831 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
8832 +
8833 +#define write_trylock_irqsave(lock, flags)     \
8834 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
8835 +
8836 +#define read_lock_irqsave(lock, flags)                 \
8837 +       do {                                            \
8838 +               typecheck(unsigned long, flags);        \
8839 +               flags = rt_read_lock_irqsave(lock);     \
8840 +       } while (0)
8841 +
8842 +#define write_lock_irqsave(lock, flags)                        \
8843 +       do {                                            \
8844 +               typecheck(unsigned long, flags);        \
8845 +               flags = rt_write_lock_irqsave(lock);    \
8846 +       } while (0)
8847 +
8848 +#define read_lock(lock)                rt_read_lock(lock)
8849 +
8850 +#define read_lock_bh(lock)                             \
8851 +       do {                                            \
8852 +               local_bh_disable();                     \
8853 +               rt_read_lock(lock);                     \
8854 +       } while (0)
8855 +
8856 +#define read_lock_irq(lock)    read_lock(lock)
8857 +
8858 +#define write_lock(lock)       rt_write_lock(lock)
8859 +
8860 +#define write_lock_bh(lock)                            \
8861 +       do {                                            \
8862 +               local_bh_disable();                     \
8863 +               rt_write_lock(lock);                    \
8864 +       } while (0)
8865 +
8866 +#define write_lock_irq(lock)   write_lock(lock)
8867 +
8868 +#define read_unlock(lock)      rt_read_unlock(lock)
8869 +
8870 +#define read_unlock_bh(lock)                           \
8871 +       do {                                            \
8872 +               rt_read_unlock(lock);                   \
8873 +               local_bh_enable();                      \
8874 +       } while (0)
8875 +
8876 +#define read_unlock_irq(lock)  read_unlock(lock)
8877 +
8878 +#define write_unlock(lock)     rt_write_unlock(lock)
8879 +
8880 +#define write_unlock_bh(lock)                          \
8881 +       do {                                            \
8882 +               rt_write_unlock(lock);                  \
8883 +               local_bh_enable();                      \
8884 +       } while (0)
8885 +
8886 +#define write_unlock_irq(lock) write_unlock(lock)
8887 +
8888 +#define read_unlock_irqrestore(lock, flags)            \
8889 +       do {                                            \
8890 +               typecheck(unsigned long, flags);        \
8891 +               (void) flags;                           \
8892 +               rt_read_unlock(lock);                   \
8893 +       } while (0)
8894 +
8895 +#define write_unlock_irqrestore(lock, flags) \
8896 +       do {                                            \
8897 +               typecheck(unsigned long, flags);        \
8898 +               (void) flags;                           \
8899 +               rt_write_unlock(lock);                  \
8900 +       } while (0)
8901 +
8902 +#endif
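Not part of the patch: a reader-side sketch under the RT rwlock mapping above; my_rwlock is a hypothetical lock. The _irqsave variants keep their flags argument for API compatibility, but on RT the value is only typechecked and then ignored, and the lock may sleep on contention.

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_rwlock);

static void my_read_side(void)
{
        unsigned long flags;

        read_lock_irqsave(&my_rwlock, flags);   /* rt_read_lock(); flags unused on RT */
        /* ... read shared data ... */
        read_unlock_irqrestore(&my_rwlock, flags);
}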
8903 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
8904 index cc0072e93e36..5317cd957292 100644
8905 --- a/include/linux/rwlock_types.h
8906 +++ b/include/linux/rwlock_types.h
8907 @@ -1,6 +1,10 @@
8908  #ifndef __LINUX_RWLOCK_TYPES_H
8909  #define __LINUX_RWLOCK_TYPES_H
8910  
8911 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
8912 +# error "Do not include directly, include spinlock_types.h"
8913 +#endif
8914 +
8915  /*
8916   * include/linux/rwlock_types.h - generic rwlock type definitions
8917   *                               and initializers
8918 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
8919 new file mode 100644
8920 index 000000000000..51b28d775fe1
8921 --- /dev/null
8922 +++ b/include/linux/rwlock_types_rt.h
8923 @@ -0,0 +1,33 @@
8924 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
8925 +#define __LINUX_RWLOCK_TYPES_RT_H
8926 +
8927 +#ifndef __LINUX_SPINLOCK_TYPES_H
8928 +#error "Do not include directly. Include spinlock_types.h instead"
8929 +#endif
8930 +
8931 +/*
8932 + * rwlocks - rtmutex which allows single reader recursion
8933 + */
8934 +typedef struct {
8935 +       struct rt_mutex         lock;
8936 +       int                     read_depth;
8937 +       unsigned int            break_lock;
8938 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8939 +       struct lockdep_map      dep_map;
8940 +#endif
8941 +} rwlock_t;
8942 +
8943 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8944 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
8945 +#else
8946 +# define RW_DEP_MAP_INIT(lockname)
8947 +#endif
8948 +
8949 +#define __RW_LOCK_UNLOCKED(name) \
8950 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
8951 +         RW_DEP_MAP_INIT(name) }
8952 +
8953 +#define DEFINE_RWLOCK(name) \
8954 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
8955 +
8956 +#endif
8957 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
8958 index dd1d14250340..8e1f44ff1f2f 100644
8959 --- a/include/linux/rwsem.h
8960 +++ b/include/linux/rwsem.h
8961 @@ -19,6 +19,10 @@
8962  #include <linux/osq_lock.h>
8963  #endif
8964  
8965 +#ifdef CONFIG_PREEMPT_RT_FULL
8966 +#include <linux/rwsem_rt.h>
8967 +#else /* PREEMPT_RT_FULL */
8968 +
8969  struct rw_semaphore;
8970  
8971  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
8972 @@ -184,4 +188,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
8973  # define up_read_non_owner(sem)                        up_read(sem)
8974  #endif
8975  
8976 +#endif /* !PREEMPT_RT_FULL */
8977 +
8978  #endif /* _LINUX_RWSEM_H */
8979 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
8980 new file mode 100644
8981 index 000000000000..e26bd95a57c3
8982 --- /dev/null
8983 +++ b/include/linux/rwsem_rt.h
8984 @@ -0,0 +1,167 @@
8985 +#ifndef _LINUX_RWSEM_RT_H
8986 +#define _LINUX_RWSEM_RT_H
8987 +
8988 +#ifndef _LINUX_RWSEM_H
8989 +#error "Include rwsem.h"
8990 +#endif
8991 +
8992 +/*
8993 + * RW-semaphores are a spinlock plus a reader-depth count.
8994 + *
8995 + * Note that the semantics are different from the usual
8996 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
8997 + * multiple readers to hold the lock at once, we only allow
8998 + * a read-lock owner to read-lock recursively. This is
8999 + * better for latency, makes the implementation inherently
9000 + * fair and makes it simpler as well.
9001 + */
9002 +
9003 +#include <linux/rtmutex.h>
9004 +
9005 +struct rw_semaphore {
9006 +       struct rt_mutex         lock;
9007 +       int                     read_depth;
9008 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9009 +       struct lockdep_map      dep_map;
9010 +#endif
9011 +};
9012 +
9013 +#define __RWSEM_INITIALIZER(name) \
9014 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
9015 +         RW_DEP_MAP_INIT(name) }
9016 +
9017 +#define DECLARE_RWSEM(lockname) \
9018 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
9019 +
9020 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
9021 +                                    struct lock_class_key *key);
9022 +
9023 +#define __rt_init_rwsem(sem, name, key)                        \
9024 +       do {                                            \
9025 +               rt_mutex_init(&(sem)->lock);            \
9026 +               __rt_rwsem_init((sem), (name), (key));\
9027 +       } while (0)
9028 +
9029 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
9030 +
9031 +# define rt_init_rwsem(sem)                            \
9032 +do {                                                   \
9033 +       static struct lock_class_key __key;             \
9034 +                                                       \
9035 +       __rt_init_rwsem((sem), #sem, &__key);           \
9036 +} while (0)
9037 +
9038 +extern void rt_down_write(struct rw_semaphore *rwsem);
9039 +extern int  rt_down_write_killable(struct rw_semaphore *rwsem);
9040 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
9041 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
9042 +extern int  rt_down_write_killable_nested(struct rw_semaphore *rwsem,
9043 +                                         int subclass);
9044 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
9045 +                                     struct lockdep_map *nest);
9046 +extern void rt__down_read(struct rw_semaphore *rwsem);
9047 +extern void rt_down_read(struct rw_semaphore *rwsem);
9048 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
9049 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
9050 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
9051 +extern void __rt_up_read(struct rw_semaphore *rwsem);
9052 +extern void rt_up_read(struct rw_semaphore *rwsem);
9053 +extern void rt_up_write(struct rw_semaphore *rwsem);
9054 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
9055 +
9056 +#define init_rwsem(sem)                rt_init_rwsem(sem)
9057 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
9058 +
9059 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
9060 +{
9061 +       /* rt_mutex_has_waiters() */
9062 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
9063 +}
9064 +
9065 +static inline void __down_read(struct rw_semaphore *sem)
9066 +{
9067 +       rt__down_read(sem);
9068 +}
9069 +
9070 +static inline void down_read(struct rw_semaphore *sem)
9071 +{
9072 +       rt_down_read(sem);
9073 +}
9074 +
9075 +static inline int __down_read_trylock(struct rw_semaphore *sem)
9076 +{
9077 +       return rt__down_read_trylock(sem);
9078 +}
9079 +
9080 +static inline int down_read_trylock(struct rw_semaphore *sem)
9081 +{
9082 +       return rt_down_read_trylock(sem);
9083 +}
9084 +
9085 +static inline void down_write(struct rw_semaphore *sem)
9086 +{
9087 +       rt_down_write(sem);
9088 +}
9089 +
9090 +static inline int down_write_killable(struct rw_semaphore *sem)
9091 +{
9092 +       return rt_down_write_killable(sem);
9093 +}
9094 +
9095 +static inline int down_write_trylock(struct rw_semaphore *sem)
9096 +{
9097 +       return rt_down_write_trylock(sem);
9098 +}
9099 +
9100 +static inline void __up_read(struct rw_semaphore *sem)
9101 +{
9102 +       __rt_up_read(sem);
9103 +}
9104 +
9105 +static inline void up_read(struct rw_semaphore *sem)
9106 +{
9107 +       rt_up_read(sem);
9108 +}
9109 +
9110 +static inline void up_write(struct rw_semaphore *sem)
9111 +{
9112 +       rt_up_write(sem);
9113 +}
9114 +
9115 +static inline void downgrade_write(struct rw_semaphore *sem)
9116 +{
9117 +       rt_downgrade_write(sem);
9118 +}
9119 +
9120 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
9121 +{
9122 +       return rt_down_read_nested(sem, subclass);
9123 +}
9124 +
9125 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
9126 +{
9127 +       rt_down_write_nested(sem, subclass);
9128 +}
9129 +
9130 +static inline int down_write_killable_nested(struct rw_semaphore *sem,
9131 +                                            int subclass)
9132 +{
9133 +       return rt_down_write_killable_nested(sem, subclass);
9134 +}
9135 +
9136 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
9137 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
9138 +               struct rw_semaphore *nest_lock)
9139 +{
9140 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
9141 +}
9142 +
9143 +#else
9144 +
9145 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
9146 +               struct rw_semaphore *nest_lock)
9147 +{
9148 +       rt_down_write_nested_lock(sem, NULL);
9149 +}
9150 +#endif
9151 +#endif
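Not part of the patch: a sketch of the reader recursion the comment above permits, with hypothetical names my_sem and my_reader. Only a single reader may hold the RT rwsem, but that owner can re-acquire it; the nesting is tracked in read_depth.

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_sem);

static void my_reader(void)
{
        down_read(&my_sem);
        down_read(&my_sem);     /* recursive read by the same owner, read_depth == 2 */
        /* ... read-side work ... */
        up_read(&my_sem);
        up_read(&my_sem);
}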
9152 diff --git a/include/linux/sched.h b/include/linux/sched.h
9153 index 75d9a57e212e..8cb7df0f56e3 100644
9154 --- a/include/linux/sched.h
9155 +++ b/include/linux/sched.h
9156 @@ -26,6 +26,7 @@ struct sched_param {
9157  #include <linux/nodemask.h>
9158  #include <linux/mm_types.h>
9159  #include <linux/preempt.h>
9160 +#include <asm/kmap_types.h>
9161  
9162  #include <asm/page.h>
9163  #include <asm/ptrace.h>
9164 @@ -243,10 +244,7 @@ extern char ___assert_task_state[1 - 2*!!(
9165                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
9166                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
9167  
9168 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
9169  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
9170 -#define task_is_stopped_or_traced(task)        \
9171 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
9172  #define task_contributes_to_load(task) \
9173                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
9174                                  (task->flags & PF_FROZEN) == 0 && \
9175 @@ -312,6 +310,11 @@ extern char ___assert_task_state[1 - 2*!!(
9176  
9177  #endif
9178  
9179 +#define __set_current_state_no_track(state_value)      \
9180 +       do { current->state = (state_value); } while (0)
9181 +#define set_current_state_no_track(state_value)                \
9182 +       set_mb(current->state, (state_value))
9183 +
9184  /* Task command name length */
9185  #define TASK_COMM_LEN 16
9186  
9187 @@ -1013,8 +1016,18 @@ struct wake_q_head {
9188         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
9189  
9190  extern void wake_q_add(struct wake_q_head *head,
9191 -                      struct task_struct *task);
9192 -extern void wake_up_q(struct wake_q_head *head);
9193 +                             struct task_struct *task);
9194 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
9195 +
9196 +static inline void wake_up_q(struct wake_q_head *head)
9197 +{
9198 +       __wake_up_q(head, false);
9199 +}
9200 +
9201 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
9202 +{
9203 +       __wake_up_q(head, true);
9204 +}
9205  
9206  /*
9207   * sched-domains (multiprocessor balancing) declarations:
9208 @@ -1481,6 +1494,7 @@ struct task_struct {
9209         struct thread_info thread_info;
9210  #endif
9211         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
9212 +       volatile long saved_state; /* saved state for "spinlock sleepers" */
9213         void *stack;
9214         atomic_t usage;
9215         unsigned int flags;     /* per process flags, defined below */
9216 @@ -1520,6 +1534,12 @@ struct task_struct {
9217  #endif
9218  
9219         unsigned int policy;
9220 +#ifdef CONFIG_PREEMPT_RT_FULL
9221 +       int migrate_disable;
9222 +# ifdef CONFIG_SCHED_DEBUG
9223 +       int migrate_disable_atomic;
9224 +# endif
9225 +#endif
9226         int nr_cpus_allowed;
9227         cpumask_t cpus_allowed;
9228  
9229 @@ -1654,6 +1674,9 @@ struct task_struct {
9230  
9231         struct task_cputime cputime_expires;
9232         struct list_head cpu_timers[3];
9233 +#ifdef CONFIG_PREEMPT_RT_BASE
9234 +       struct task_struct *posix_timer_list;
9235 +#endif
9236  
9237  /* process credentials */
9238         const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */
9239 @@ -1685,10 +1708,15 @@ struct task_struct {
9240  /* signal handlers */
9241         struct signal_struct *signal;
9242         struct sighand_struct *sighand;
9243 +       struct sigqueue *sigqueue_cache;
9244  
9245         sigset_t blocked, real_blocked;
9246         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
9247         struct sigpending pending;
9248 +#ifdef CONFIG_PREEMPT_RT_FULL
9249 +       /* TODO: move me into ->restart_block ? */
9250 +       struct siginfo forced_info;
9251 +#endif
9252  
9253         unsigned long sas_ss_sp;
9254         size_t sas_ss_size;
9255 @@ -1917,6 +1945,12 @@ struct task_struct {
9256         /* bitmask and counter of trace recursion */
9257         unsigned long trace_recursion;
9258  #endif /* CONFIG_TRACING */
9259 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
9260 +       u64 preempt_timestamp_hist;
9261 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
9262 +       long timer_offset;
9263 +#endif
9264 +#endif
9265  #ifdef CONFIG_KCOV
9266         /* Coverage collection mode enabled for this task (0 if disabled). */
9267         enum kcov_mode kcov_mode;
9268 @@ -1942,9 +1976,23 @@ struct task_struct {
9269         unsigned int    sequential_io;
9270         unsigned int    sequential_io_avg;
9271  #endif
9272 +#ifdef CONFIG_PREEMPT_RT_BASE
9273 +       struct rcu_head put_rcu;
9274 +       int softirq_nestcnt;
9275 +       unsigned int softirqs_raised;
9276 +#endif
9277 +#ifdef CONFIG_PREEMPT_RT_FULL
9278 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
9279 +       int kmap_idx;
9280 +       pte_t kmap_pte[KM_TYPE_NR];
9281 +# endif
9282 +#endif
9283  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
9284         unsigned long   task_state_change;
9285  #endif
9286 +#ifdef CONFIG_PREEMPT_RT_FULL
9287 +       int xmit_recursion;
9288 +#endif
9289         int pagefault_disabled;
9290  #ifdef CONFIG_MMU
9291         struct task_struct *oom_reaper_list;
9292 @@ -1984,14 +2032,6 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
9293  }
9294  #endif
9295  
9296 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
9297 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
9298 -
9299 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9300 -{
9301 -       return p->nr_cpus_allowed;
9302 -}
9303 -
9304  #define TNF_MIGRATED   0x01
9305  #define TNF_NO_GROUP   0x02
9306  #define TNF_SHARED     0x04
9307 @@ -2207,6 +2247,15 @@ extern struct pid *cad_pid;
9308  extern void free_task(struct task_struct *tsk);
9309  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
9310  
9311 +#ifdef CONFIG_PREEMPT_RT_BASE
9312 +extern void __put_task_struct_cb(struct rcu_head *rhp);
9313 +
9314 +static inline void put_task_struct(struct task_struct *t)
9315 +{
9316 +       if (atomic_dec_and_test(&t->usage))
9317 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
9318 +}
9319 +#else
9320  extern void __put_task_struct(struct task_struct *t);
9321  
9322  static inline void put_task_struct(struct task_struct *t)
9323 @@ -2214,6 +2263,7 @@ static inline void put_task_struct(struct task_struct *t)
9324         if (atomic_dec_and_test(&t->usage))
9325                 __put_task_struct(t);
9326  }
9327 +#endif
9328  
9329  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
9330  struct task_struct *try_get_task_struct(struct task_struct **ptask);
9331 @@ -2255,6 +2305,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
9332  /*
9333   * Per process flags
9334   */
9335 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
9336  #define PF_EXITING     0x00000004      /* getting shut down */
9337  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
9338  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
9339 @@ -2423,6 +2474,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
9340  
9341  extern int set_cpus_allowed_ptr(struct task_struct *p,
9342                                 const struct cpumask *new_mask);
9343 +int migrate_me(void);
9344 +void tell_sched_cpu_down_begin(int cpu);
9345 +void tell_sched_cpu_down_done(int cpu);
9346 +
9347  #else
9348  static inline void do_set_cpus_allowed(struct task_struct *p,
9349                                       const struct cpumask *new_mask)
9350 @@ -2435,6 +2490,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
9351                 return -EINVAL;
9352         return 0;
9353  }
9354 +static inline int migrate_me(void) { return 0; }
9355 +static inline void tell_sched_cpu_down_begin(int cpu) { }
9356 +static inline void tell_sched_cpu_down_done(int cpu) { }
9357  #endif
9358  
9359  #ifdef CONFIG_NO_HZ_COMMON
9360 @@ -2673,6 +2731,7 @@ extern void xtime_update(unsigned long ticks);
9361  
9362  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
9363  extern int wake_up_process(struct task_struct *tsk);
9364 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
9365  extern void wake_up_new_task(struct task_struct *tsk);
9366  #ifdef CONFIG_SMP
9367   extern void kick_process(struct task_struct *tsk);
9368 @@ -2881,6 +2940,17 @@ static inline void mmdrop(struct mm_struct *mm)
9369                 __mmdrop(mm);
9370  }
9371  
9372 +#ifdef CONFIG_PREEMPT_RT_BASE
9373 +extern void __mmdrop_delayed(struct rcu_head *rhp);
9374 +static inline void mmdrop_delayed(struct mm_struct *mm)
9375 +{
9376 +       if (atomic_dec_and_test(&mm->mm_count))
9377 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
9378 +}
9379 +#else
9380 +# define mmdrop_delayed(mm)    mmdrop(mm)
9381 +#endif
9382 +
9383  static inline void mmdrop_async_fn(struct work_struct *work)
9384  {
9385         struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
9386 @@ -3273,6 +3343,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
9387         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
9388  }
9389  
9390 +#ifdef CONFIG_PREEMPT_LAZY
9391 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
9392 +{
9393 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9394 +}
9395 +
9396 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
9397 +{
9398 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
9399 +}
9400 +
9401 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
9402 +{
9403 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
9404 +}
9405 +
9406 +static inline int need_resched_lazy(void)
9407 +{
9408 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
9409 +}
9410 +
9411 +static inline int need_resched_now(void)
9412 +{
9413 +       return test_thread_flag(TIF_NEED_RESCHED);
9414 +}
9415 +
9416 +#else
9417 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
9418 +static inline int need_resched_lazy(void) { return 0; }
9419 +
9420 +static inline int need_resched_now(void)
9421 +{
9422 +       return test_thread_flag(TIF_NEED_RESCHED);
9423 +}
9424 +
9425 +#endif
9426 +
9427  static inline int restart_syscall(void)
9428  {
9429         set_tsk_thread_flag(current, TIF_SIGPENDING);
9430 @@ -3304,6 +3411,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
9431         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
9432  }
9433  
9434 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
9435 +{
9436 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
9437 +               return true;
9438 +#ifdef CONFIG_PREEMPT_RT_FULL
9439 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
9440 +               return true;
9441 +#endif
9442 +       return false;
9443 +}
9444 +
9445 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
9446 +{
9447 +       bool traced_stopped;
9448 +
9449 +#ifdef CONFIG_PREEMPT_RT_FULL
9450 +       unsigned long flags;
9451 +
9452 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
9453 +       traced_stopped = __task_is_stopped_or_traced(task);
9454 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
9455 +#else
9456 +       traced_stopped = __task_is_stopped_or_traced(task);
9457 +#endif
9458 +       return traced_stopped;
9459 +}
9460 +
9461 +static inline bool task_is_traced(struct task_struct *task)
9462 +{
9463 +       bool traced = false;
9464 +
9465 +       if (task->state & __TASK_TRACED)
9466 +               return true;
9467 +#ifdef CONFIG_PREEMPT_RT_FULL
9468 +       /* in case the task is sleeping on tasklist_lock */
9469 +       raw_spin_lock_irq(&task->pi_lock);
9470 +       if (task->state & __TASK_TRACED)
9471 +               traced = true;
9472 +       else if (task->saved_state & __TASK_TRACED)
9473 +               traced = true;
9474 +       raw_spin_unlock_irq(&task->pi_lock);
9475 +#endif
9476 +       return traced;
9477 +}
9478 +
9479  /*
9480   * cond_resched() and cond_resched_lock(): latency reduction via
9481   * explicit rescheduling in places that are safe. The return
9482 @@ -3329,12 +3481,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
9483         __cond_resched_lock(lock);                              \
9484  })
9485  
9486 +#ifndef CONFIG_PREEMPT_RT_FULL
9487  extern int __cond_resched_softirq(void);
9488  
9489  #define cond_resched_softirq() ({                                      \
9490         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
9491         __cond_resched_softirq();                                       \
9492  })
9493 +#else
9494 +# define cond_resched_softirq()                cond_resched()
9495 +#endif
9496  
9497  static inline void cond_resched_rcu(void)
9498  {
9499 @@ -3509,6 +3665,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
9500  
9501  #endif /* CONFIG_SMP */
9502  
9503 +static inline int __migrate_disabled(struct task_struct *p)
9504 +{
9505 +#ifdef CONFIG_PREEMPT_RT_FULL
9506 +       return p->migrate_disable;
9507 +#else
9508 +       return 0;
9509 +#endif
9510 +}
9511 +
9512 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
9513 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
9514 +{
9515 +       if (__migrate_disabled(p))
9516 +               return cpumask_of(task_cpu(p));
9517 +
9518 +       return &p->cpus_allowed;
9519 +}
9520 +
9521 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
9522 +{
9523 +       if (__migrate_disabled(p))
9524 +               return 1;
9525 +       return p->nr_cpus_allowed;
9526 +}
9527 +
9528  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
9529  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
9530  
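Not part of the patch: a sketch of what the reworked accessors report; my_show_affinity is a hypothetical helper. While a task runs with migrate_disable() set, tsk_cpus_allowed() collapses to the current CPU and tsk_nr_cpus_allowed() returns 1, so placement code sees the pinning.

#include <linux/printk.h>
#include <linux/sched.h>

static void my_show_affinity(struct task_struct *p)
{
        const struct cpumask *mask = tsk_cpus_allowed(p);

        pr_info("%s: %d allowed cpu(s), first %d\n",
                p->comm, tsk_nr_cpus_allowed(p), cpumask_first(mask));
}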
9531 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
9532 index ead97654c4e9..3d7223ffdd3b 100644
9533 --- a/include/linux/seqlock.h
9534 +++ b/include/linux/seqlock.h
9535 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
9536         return __read_seqcount_retry(s, start);
9537  }
9538  
9539 -
9540 -
9541 -static inline void raw_write_seqcount_begin(seqcount_t *s)
9542 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
9543  {
9544         s->sequence++;
9545         smp_wmb();
9546  }
9547  
9548 -static inline void raw_write_seqcount_end(seqcount_t *s)
9549 +static inline void raw_write_seqcount_begin(seqcount_t *s)
9550 +{
9551 +       preempt_disable_rt();
9552 +       __raw_write_seqcount_begin(s);
9553 +}
9554 +
9555 +static inline void __raw_write_seqcount_end(seqcount_t *s)
9556  {
9557         smp_wmb();
9558         s->sequence++;
9559  }
9560  
9561 +static inline void raw_write_seqcount_end(seqcount_t *s)
9562 +{
9563 +       __raw_write_seqcount_end(s);
9564 +       preempt_enable_rt();
9565 +}
9566 +
9567  /**
9568   * raw_write_seqcount_barrier - do a seq write barrier
9569   * @s: pointer to seqcount_t
9570 @@ -428,10 +438,32 @@ typedef struct {
9571  /*
9572   * Read side functions for starting and finalizing a read side section.
9573   */
9574 +#ifndef CONFIG_PREEMPT_RT_FULL
9575  static inline unsigned read_seqbegin(const seqlock_t *sl)
9576  {
9577         return read_seqcount_begin(&sl->seqcount);
9578  }
9579 +#else
9580 +/*
9581 + * Starvation safe read side for RT
9582 + */
9583 +static inline unsigned read_seqbegin(seqlock_t *sl)
9584 +{
9585 +       unsigned ret;
9586 +
9587 +repeat:
9588 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
9589 +       if (unlikely(ret & 1)) {
9590 +               /*
9591 +                * Take the lock and let the writer proceed (i.e. possibly
9592 +                * boost it), otherwise we could loop here forever.
9593 +                */
9594 +               spin_unlock_wait(&sl->lock);
9595 +               goto repeat;
9596 +       }
9597 +       return ret;
9598 +}
9599 +#endif
9600  
9601  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9602  {
9603 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
9604  static inline void write_seqlock(seqlock_t *sl)
9605  {
9606         spin_lock(&sl->lock);
9607 -       write_seqcount_begin(&sl->seqcount);
9608 +       __raw_write_seqcount_begin(&sl->seqcount);
9609 +}
9610 +
9611 +static inline int try_write_seqlock(seqlock_t *sl)
9612 +{
9613 +       if (spin_trylock(&sl->lock)) {
9614 +               __raw_write_seqcount_begin(&sl->seqcount);
9615 +               return 1;
9616 +       }
9617 +       return 0;
9618  }
9619  
9620  static inline void write_sequnlock(seqlock_t *sl)
9621  {
9622 -       write_seqcount_end(&sl->seqcount);
9623 +       __raw_write_seqcount_end(&sl->seqcount);
9624         spin_unlock(&sl->lock);
9625  }
9626  
9627  static inline void write_seqlock_bh(seqlock_t *sl)
9628  {
9629         spin_lock_bh(&sl->lock);
9630 -       write_seqcount_begin(&sl->seqcount);
9631 +       __raw_write_seqcount_begin(&sl->seqcount);
9632  }
9633  
9634  static inline void write_sequnlock_bh(seqlock_t *sl)
9635  {
9636 -       write_seqcount_end(&sl->seqcount);
9637 +       __raw_write_seqcount_end(&sl->seqcount);
9638         spin_unlock_bh(&sl->lock);
9639  }
9640  
9641  static inline void write_seqlock_irq(seqlock_t *sl)
9642  {
9643         spin_lock_irq(&sl->lock);
9644 -       write_seqcount_begin(&sl->seqcount);
9645 +       __raw_write_seqcount_begin(&sl->seqcount);
9646  }
9647  
9648  static inline void write_sequnlock_irq(seqlock_t *sl)
9649  {
9650 -       write_seqcount_end(&sl->seqcount);
9651 +       __raw_write_seqcount_end(&sl->seqcount);
9652         spin_unlock_irq(&sl->lock);
9653  }
9654  
9655 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9656         unsigned long flags;
9657  
9658         spin_lock_irqsave(&sl->lock, flags);
9659 -       write_seqcount_begin(&sl->seqcount);
9660 +       __raw_write_seqcount_begin(&sl->seqcount);
9661         return flags;
9662  }
9663  
9664 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
9665  static inline void
9666  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
9667  {
9668 -       write_seqcount_end(&sl->seqcount);
9669 +       __raw_write_seqcount_end(&sl->seqcount);
9670         spin_unlock_irqrestore(&sl->lock, flags);
9671  }
9672  
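Not part of the patch: a writer/reader sketch under the reworked seqlock above, with hypothetical names my_seqlock and my_value. The writer is serialized by sl->lock, and on RT a reader that observes an odd sequence waits on that lock via spin_unlock_wait(), so it can boost the writer instead of spinning forever.

#include <linux/seqlock.h>

static seqlock_t my_seqlock = __SEQLOCK_UNLOCKED(my_seqlock);
static u64 my_value;

static void my_write(u64 v)
{
        write_seqlock(&my_seqlock);     /* spin_lock() + __raw_write_seqcount_begin() */
        my_value = v;
        write_sequnlock(&my_seqlock);
}

static u64 my_read(void)
{
        unsigned int seq;
        u64 v;

        do {
                seq = read_seqbegin(&my_seqlock);       /* waits on sl->lock on RT */
                v = my_value;
        } while (read_seqretry(&my_seqlock, seq));

        return v;
}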
9673 diff --git a/include/linux/signal.h b/include/linux/signal.h
9674 index b63f63eaa39c..295540fdfc72 100644
9675 --- a/include/linux/signal.h
9676 +++ b/include/linux/signal.h
9677 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
9678  }
9679  
9680  extern void flush_sigqueue(struct sigpending *queue);
9681 +extern void flush_task_sigqueue(struct task_struct *tsk);
9682  
9683  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
9684  static inline int valid_signal(unsigned long sig)
9685 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
9686 index 32810f279f8e..0db6e31161f6 100644
9687 --- a/include/linux/skbuff.h
9688 +++ b/include/linux/skbuff.h
9689 @@ -284,6 +284,7 @@ struct sk_buff_head {
9690  
9691         __u32           qlen;
9692         spinlock_t      lock;
9693 +       raw_spinlock_t  raw_lock;
9694  };
9695  
9696  struct sk_buff;
9697 @@ -1573,6 +1574,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
9698         __skb_queue_head_init(list);
9699  }
9700  
9701 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
9702 +{
9703 +       raw_spin_lock_init(&list->raw_lock);
9704 +       __skb_queue_head_init(list);
9705 +}
9706 +
9707  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
9708                 struct lock_class_key *class)
9709  {
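
skb_queue_head_init_raw() above initializes the new raw_lock instead of the regular spinlock (which becomes a sleeping lock on RT). A hedged sketch of how a queue set up this way would be driven from contexts that must not sleep: pair the unlocked __skb_* helpers with an explicit raw_spin_lock_irqsave() on list->raw_lock. Queue and function names are invented for illustration:

        #include <linux/skbuff.h>

        static struct sk_buff_head demo_q;      /* protected by demo_q.raw_lock */

        static void demo_q_setup(void)
        {
                skb_queue_head_init_raw(&demo_q);
        }

        static void demo_q_add(struct sk_buff *skb)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&demo_q.raw_lock, flags);
                __skb_queue_tail(&demo_q, skb);         /* unlocked helper */
                raw_spin_unlock_irqrestore(&demo_q.raw_lock, flags);
        }

        static struct sk_buff *demo_q_get(void)
        {
                unsigned long flags;
                struct sk_buff *skb;

                raw_spin_lock_irqsave(&demo_q.raw_lock, flags);
                skb = __skb_dequeue(&demo_q);
                raw_spin_unlock_irqrestore(&demo_q.raw_lock, flags);
                return skb;
        }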
9710 diff --git a/include/linux/smp.h b/include/linux/smp.h
9711 index 8e0cb7a0f836..b16ca967ad80 100644
9712 --- a/include/linux/smp.h
9713 +++ b/include/linux/smp.h
9714 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
9715  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
9716  #define put_cpu()              preempt_enable()
9717  
9718 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
9719 +#define put_cpu_light()                migrate_enable()
9720 +
9721  /*
9722   * Callback to arch code if there's nosmp or maxcpus=0 on the
9723   * boot command line:
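
get_cpu_light()/put_cpu_light() pin the task to its CPU via migrate_disable() without disabling preemption, so the section stays preemptible and may take sleeping (RT) locks. Because other tasks can still run on the same CPU, per-CPU data touched in such a section needs its own lock. An illustrative sketch, not part of the patch, with invented names:

        #include <linux/smp.h>
        #include <linux/spinlock.h>
        #include <linux/percpu.h>

        struct demo_pcpu {
                spinlock_t lock;        /* sleeping lock on RT - fine here */
                unsigned long count;
        };
        static DEFINE_PER_CPU(struct demo_pcpu, demo_pcpu);

        static void demo_account(void)
        {
                struct demo_pcpu *p;
                int cpu;

                cpu = get_cpu_light();          /* migration off, preemption on */
                p = &per_cpu(demo_pcpu, cpu);
                spin_lock(&p->lock);            /* still needed: we may be preempted
                                                   (lock init at boot omitted) */
                p->count++;
                spin_unlock(&p->lock);
                put_cpu_light();
        }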
9724 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
9725 index 47dd0cebd204..02928fa5499d 100644
9726 --- a/include/linux/spinlock.h
9727 +++ b/include/linux/spinlock.h
9728 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9729  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
9730  
9731  /* Include rwlock functions */
9732 -#include <linux/rwlock.h>
9733 +#ifdef CONFIG_PREEMPT_RT_FULL
9734 +# include <linux/rwlock_rt.h>
9735 +#else
9736 +# include <linux/rwlock.h>
9737 +#endif
9738  
9739  /*
9740   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
9741 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
9742  # include <linux/spinlock_api_up.h>
9743  #endif
9744  
9745 +#ifdef CONFIG_PREEMPT_RT_FULL
9746 +# include <linux/spinlock_rt.h>
9747 +#else /* PREEMPT_RT_FULL */
9748 +
9749  /*
9750   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
9751   */
9752 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
9753         raw_spin_unlock(&lock->rlock);
9754  }
9755  
9756 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
9757 +{
9758 +       raw_spin_unlock(&lock->rlock);
9759 +       return 0;
9760 +}
9761 +
9762  static __always_inline void spin_unlock_bh(spinlock_t *lock)
9763  {
9764         raw_spin_unlock_bh(&lock->rlock);
9765 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
9766  #define atomic_dec_and_lock(atomic, lock) \
9767                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
9768  
9769 +#endif /* !PREEMPT_RT_FULL */
9770 +
9771  #endif /* __LINUX_SPINLOCK_H */
9772 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
9773 index 5344268e6e62..043263f30e81 100644
9774 --- a/include/linux/spinlock_api_smp.h
9775 +++ b/include/linux/spinlock_api_smp.h
9776 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
9777         return 0;
9778  }
9779  
9780 -#include <linux/rwlock_api_smp.h>
9781 +#ifndef CONFIG_PREEMPT_RT_FULL
9782 +# include <linux/rwlock_api_smp.h>
9783 +#endif
9784  
9785  #endif /* __LINUX_SPINLOCK_API_SMP_H */
9786 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
9787 new file mode 100644
9788 index 000000000000..3534cff3dd08
9789 --- /dev/null
9790 +++ b/include/linux/spinlock_rt.h
9791 @@ -0,0 +1,164 @@
9792 +#ifndef __LINUX_SPINLOCK_RT_H
9793 +#define __LINUX_SPINLOCK_RT_H
9794 +
9795 +#ifndef __LINUX_SPINLOCK_H
9796 +#error Do not include directly. Use spinlock.h
9797 +#endif
9798 +
9799 +#include <linux/bug.h>
9800 +
9801 +extern void
9802 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
9803 +
9804 +#define spin_lock_init(slock)                          \
9805 +do {                                                   \
9806 +       static struct lock_class_key __key;             \
9807 +                                                       \
9808 +       rt_mutex_init(&(slock)->lock);                  \
9809 +       __rt_spin_lock_init(slock, #slock, &__key);     \
9810 +} while (0)
9811 +
9812 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
9813 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
9814 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
9815 +
9816 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
9817 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
9818 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
9819 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
9820 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
9821 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
9822 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
9823 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
9824 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
9825 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
9826 +
9827 +/*
9828 + * lockdep-less calls, for derived types like rwlock:
9829 + * (for trylock they can use rt_mutex_trylock() directly).
9830 + */
9831 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
9832 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
9833 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
9834 +
9835 +#define spin_lock(lock)                        rt_spin_lock(lock)
9836 +
9837 +#define spin_lock_bh(lock)                     \
9838 +       do {                                    \
9839 +               local_bh_disable();             \
9840 +               rt_spin_lock(lock);             \
9841 +       } while (0)
9842 +
9843 +#define spin_lock_irq(lock)            spin_lock(lock)
9844 +
9845 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
9846 +
9847 +#define spin_trylock(lock)                     \
9848 +({                                             \
9849 +       int __locked;                           \
9850 +       __locked = spin_do_trylock(lock);       \
9851 +       __locked;                               \
9852 +})
9853 +
9854 +#ifdef CONFIG_LOCKDEP
9855 +# define spin_lock_nested(lock, subclass)              \
9856 +       do {                                            \
9857 +               rt_spin_lock_nested(lock, subclass);    \
9858 +       } while (0)
9859 +
9860 +#define spin_lock_bh_nested(lock, subclass)            \
9861 +       do {                                            \
9862 +               local_bh_disable();                     \
9863 +               rt_spin_lock_nested(lock, subclass);    \
9864 +       } while (0)
9865 +
9866 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9867 +       do {                                             \
9868 +               typecheck(unsigned long, flags);         \
9869 +               flags = 0;                               \
9870 +               rt_spin_lock_nested(lock, subclass);     \
9871 +       } while (0)
9872 +#else
9873 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
9874 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
9875 +
9876 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
9877 +       do {                                             \
9878 +               typecheck(unsigned long, flags);         \
9879 +               flags = 0;                               \
9880 +               spin_lock(lock);                         \
9881 +       } while (0)
9882 +#endif
9883 +
9884 +#define spin_lock_irqsave(lock, flags)                  \
9885 +       do {                                             \
9886 +               typecheck(unsigned long, flags);         \
9887 +               flags = 0;                               \
9888 +               spin_lock(lock);                         \
9889 +       } while (0)
9890 +
9891 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
9892 +{
9893 +       unsigned long flags = 0;
9894 +#ifdef CONFIG_TRACE_IRQFLAGS
9895 +       flags = rt_spin_lock_trace_flags(lock);
9896 +#else
9897 +       spin_lock(lock); /* lock_local */
9898 +#endif
9899 +       return flags;
9900 +}
9901 +
9902 +/* FIXME: we need rt_spin_lock_nest_lock */
9903 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
9904 +
9905 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
9906 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
9907 +
9908 +#define spin_unlock_bh(lock)                           \
9909 +       do {                                            \
9910 +               rt_spin_unlock(lock);                   \
9911 +               local_bh_enable();                      \
9912 +       } while (0)
9913 +
9914 +#define spin_unlock_irq(lock)          spin_unlock(lock)
9915 +
9916 +#define spin_unlock_irqrestore(lock, flags)            \
9917 +       do {                                            \
9918 +               typecheck(unsigned long, flags);        \
9919 +               (void) flags;                           \
9920 +               spin_unlock(lock);                      \
9921 +       } while (0)
9922 +
9923 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
9924 +#define spin_trylock_irq(lock) spin_trylock(lock)
9925 +
9926 +#define spin_trylock_irqsave(lock, flags)      \
9927 +       rt_spin_trylock_irqsave(lock, &(flags))
9928 +
9929 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
9930 +
9931 +#ifdef CONFIG_GENERIC_LOCKBREAK
9932 +# define spin_is_contended(lock)       ((lock)->break_lock)
9933 +#else
9934 +# define spin_is_contended(lock)       (((void)(lock), 0))
9935 +#endif
9936 +
9937 +static inline int spin_can_lock(spinlock_t *lock)
9938 +{
9939 +       return !rt_mutex_is_locked(&lock->lock);
9940 +}
9941 +
9942 +static inline int spin_is_locked(spinlock_t *lock)
9943 +{
9944 +       return rt_mutex_is_locked(&lock->lock);
9945 +}
9946 +
9947 +static inline void assert_spin_locked(spinlock_t *lock)
9948 +{
9949 +       BUG_ON(!spin_is_locked(lock));
9950 +}
9951 +
9952 +#define atomic_dec_and_lock(atomic, lock) \
9953 +       atomic_dec_and_spin_lock(atomic, lock)
9954 +
9955 +#endif
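
Two consequences of the mapping in this header are worth spelling out: on PREEMPT_RT_FULL, spin_lock() acquires an rt_mutex-based lock and can therefore sleep, and spin_lock_irqsave() no longer disables interrupts (flags is only type-checked and forced to 0). Existing spinlock users keep compiling unchanged; a hedged sketch of what such a caller effectively gets on RT (names invented, not part of the patch):

        #include <linux/spinlock.h>

        static DEFINE_SPINLOCK(demo_lock);
        static unsigned long demo_counter;

        static void demo_touch(void)
        {
                unsigned long flags;

                /*
                 * !RT: interrupts off + spinning lock.
                 * RT:  flags is simply set to 0, interrupts stay enabled and
                 *      the "spinlock" is an rtmutex, so this may sleep and
                 *      must not be called from raw interrupt context.
                 */
                spin_lock_irqsave(&demo_lock, flags);
                demo_counter++;
                spin_unlock_irqrestore(&demo_lock, flags);
        }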
9956 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
9957 index 73548eb13a5d..10bac715ea96 100644
9958 --- a/include/linux/spinlock_types.h
9959 +++ b/include/linux/spinlock_types.h
9960 @@ -9,80 +9,15 @@
9961   * Released under the General Public License (GPL).
9962   */
9963  
9964 -#if defined(CONFIG_SMP)
9965 -# include <asm/spinlock_types.h>
9966 +#include <linux/spinlock_types_raw.h>
9967 +
9968 +#ifndef CONFIG_PREEMPT_RT_FULL
9969 +# include <linux/spinlock_types_nort.h>
9970 +# include <linux/rwlock_types.h>
9971  #else
9972 -# include <linux/spinlock_types_up.h>
9973 +# include <linux/rtmutex.h>
9974 +# include <linux/spinlock_types_rt.h>
9975 +# include <linux/rwlock_types_rt.h>
9976  #endif
9977  
9978 -#include <linux/lockdep.h>
9979 -
9980 -typedef struct raw_spinlock {
9981 -       arch_spinlock_t raw_lock;
9982 -#ifdef CONFIG_GENERIC_LOCKBREAK
9983 -       unsigned int break_lock;
9984 -#endif
9985 -#ifdef CONFIG_DEBUG_SPINLOCK
9986 -       unsigned int magic, owner_cpu;
9987 -       void *owner;
9988 -#endif
9989 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9990 -       struct lockdep_map dep_map;
9991 -#endif
9992 -} raw_spinlock_t;
9993 -
9994 -#define SPINLOCK_MAGIC         0xdead4ead
9995 -
9996 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
9997 -
9998 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
9999 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10000 -#else
10001 -# define SPIN_DEP_MAP_INIT(lockname)
10002 -#endif
10003 -
10004 -#ifdef CONFIG_DEBUG_SPINLOCK
10005 -# define SPIN_DEBUG_INIT(lockname)             \
10006 -       .magic = SPINLOCK_MAGIC,                \
10007 -       .owner_cpu = -1,                        \
10008 -       .owner = SPINLOCK_OWNER_INIT,
10009 -#else
10010 -# define SPIN_DEBUG_INIT(lockname)
10011 -#endif
10012 -
10013 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10014 -       {                                       \
10015 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10016 -       SPIN_DEBUG_INIT(lockname)               \
10017 -       SPIN_DEP_MAP_INIT(lockname) }
10018 -
10019 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10020 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10021 -
10022 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10023 -
10024 -typedef struct spinlock {
10025 -       union {
10026 -               struct raw_spinlock rlock;
10027 -
10028 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
10029 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10030 -               struct {
10031 -                       u8 __padding[LOCK_PADSIZE];
10032 -                       struct lockdep_map dep_map;
10033 -               };
10034 -#endif
10035 -       };
10036 -} spinlock_t;
10037 -
10038 -#define __SPIN_LOCK_INITIALIZER(lockname) \
10039 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10040 -
10041 -#define __SPIN_LOCK_UNLOCKED(lockname) \
10042 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10043 -
10044 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10045 -
10046 -#include <linux/rwlock_types.h>
10047 -
10048  #endif /* __LINUX_SPINLOCK_TYPES_H */
10049 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
10050 new file mode 100644
10051 index 000000000000..f1dac1fb1d6a
10052 --- /dev/null
10053 +++ b/include/linux/spinlock_types_nort.h
10054 @@ -0,0 +1,33 @@
10055 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
10056 +#define __LINUX_SPINLOCK_TYPES_NORT_H
10057 +
10058 +#ifndef __LINUX_SPINLOCK_TYPES_H
10059 +#error "Do not include directly. Include spinlock_types.h instead"
10060 +#endif
10061 +
10062 +/*
10063 + * The non RT version maps spinlocks to raw_spinlocks
10064 + */
10065 +typedef struct spinlock {
10066 +       union {
10067 +               struct raw_spinlock rlock;
10068 +
10069 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10070 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
10071 +               struct {
10072 +                       u8 __padding[LOCK_PADSIZE];
10073 +                       struct lockdep_map dep_map;
10074 +               };
10075 +#endif
10076 +       };
10077 +} spinlock_t;
10078 +
10079 +#define __SPIN_LOCK_INITIALIZER(lockname) \
10080 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
10081 +
10082 +#define __SPIN_LOCK_UNLOCKED(lockname) \
10083 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
10084 +
10085 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
10086 +
10087 +#endif
10088 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
10089 new file mode 100644
10090 index 000000000000..edffc4d53fc9
10091 --- /dev/null
10092 +++ b/include/linux/spinlock_types_raw.h
10093 @@ -0,0 +1,56 @@
10094 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
10095 +#define __LINUX_SPINLOCK_TYPES_RAW_H
10096 +
10097 +#if defined(CONFIG_SMP)
10098 +# include <asm/spinlock_types.h>
10099 +#else
10100 +# include <linux/spinlock_types_up.h>
10101 +#endif
10102 +
10103 +#include <linux/lockdep.h>
10104 +
10105 +typedef struct raw_spinlock {
10106 +       arch_spinlock_t raw_lock;
10107 +#ifdef CONFIG_GENERIC_LOCKBREAK
10108 +       unsigned int break_lock;
10109 +#endif
10110 +#ifdef CONFIG_DEBUG_SPINLOCK
10111 +       unsigned int magic, owner_cpu;
10112 +       void *owner;
10113 +#endif
10114 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10115 +       struct lockdep_map dep_map;
10116 +#endif
10117 +} raw_spinlock_t;
10118 +
10119 +#define SPINLOCK_MAGIC         0xdead4ead
10120 +
10121 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
10122 +
10123 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10124 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
10125 +#else
10126 +# define SPIN_DEP_MAP_INIT(lockname)
10127 +#endif
10128 +
10129 +#ifdef CONFIG_DEBUG_SPINLOCK
10130 +# define SPIN_DEBUG_INIT(lockname)             \
10131 +       .magic = SPINLOCK_MAGIC,                \
10132 +       .owner_cpu = -1,                        \
10133 +       .owner = SPINLOCK_OWNER_INIT,
10134 +#else
10135 +# define SPIN_DEBUG_INIT(lockname)
10136 +#endif
10137 +
10138 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
10139 +       {                                       \
10140 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
10141 +       SPIN_DEBUG_INIT(lockname)               \
10142 +       SPIN_DEP_MAP_INIT(lockname) }
10143 +
10144 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
10145 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
10146 +
10147 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
10148 +
10149 +#endif
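
raw_spinlock_t keeps its original meaning on both configurations: it always spins with preemption disabled, so it remains the tool for the few low-level paths that genuinely cannot sleep (scheduler core, timers, architecture code). A brief illustrative sketch, not part of the patch:

        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(demo_raw_lock);
        static u64 demo_hw_state;

        /* Callable from any context, including hard interrupt handlers on RT. */
        static void demo_hw_update(u64 val)
        {
                unsigned long flags;

                raw_spin_lock_irqsave(&demo_raw_lock, flags);
                demo_hw_state = val;            /* keep the critical section tiny */
                raw_spin_unlock_irqrestore(&demo_raw_lock, flags);
        }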
10150 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
10151 new file mode 100644
10152 index 000000000000..3e3d8c5f7a9a
10153 --- /dev/null
10154 +++ b/include/linux/spinlock_types_rt.h
10155 @@ -0,0 +1,48 @@
10156 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
10157 +#define __LINUX_SPINLOCK_TYPES_RT_H
10158 +
10159 +#ifndef __LINUX_SPINLOCK_TYPES_H
10160 +#error "Do not include directly. Include spinlock_types.h instead"
10161 +#endif
10162 +
10163 +#include <linux/cache.h>
10164 +
10165 +/*
10166 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
10167 + */
10168 +typedef struct spinlock {
10169 +       struct rt_mutex         lock;
10170 +       unsigned int            break_lock;
10171 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10172 +       struct lockdep_map      dep_map;
10173 +#endif
10174 +} spinlock_t;
10175 +
10176 +#ifdef CONFIG_DEBUG_RT_MUTEXES
10177 +# define __RT_SPIN_INITIALIZER(name) \
10178 +       { \
10179 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
10180 +       .save_state = 1, \
10181 +       .file = __FILE__, \
10182 +       .line = __LINE__ , \
10183 +       }
10184 +#else
10185 +# define __RT_SPIN_INITIALIZER(name) \
10186 +       {                                                               \
10187 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
10188 +       .save_state = 1, \
10189 +       }
10190 +#endif
10191 +
10192 +/*
10193 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
10194 +*/
10195 +
10196 +#define __SPIN_LOCK_UNLOCKED(name)                     \
10197 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
10198 +         SPIN_DEP_MAP_INIT(name) }
10199 +
10200 +#define DEFINE_SPINLOCK(name) \
10201 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
10202 +
10203 +#endif
10204 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
10205 index dc8eb63c6568..e793d3a257da 100644
10206 --- a/include/linux/srcu.h
10207 +++ b/include/linux/srcu.h
10208 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
10209  
10210  void process_srcu(struct work_struct *work);
10211  
10212 -#define __SRCU_STRUCT_INIT(name)                                       \
10213 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
10214         {                                                               \
10215                 .completed = -300,                                      \
10216 -               .per_cpu_ref = &name##_srcu_array,                      \
10217 +               .per_cpu_ref = &pcpu_name,                              \
10218                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
10219                 .running = false,                                       \
10220                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
10221 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
10222   */
10223  #define __DEFINE_SRCU(name, is_static)                                 \
10224         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
10225 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
10226 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
10227  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
10228  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
10229  
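
The __SRCU_STRUCT_INIT() change only threads the per-CPU array name through as an explicit parameter so that DEFINE_SRCU()-style wrappers elsewhere in this patch can pass a differently named array; callers of the documented SRCU API are unaffected. For reference, a minimal illustrative reader/updater pairing (not from the patch):

        #include <linux/srcu.h>

        DEFINE_STATIC_SRCU(demo_srcu);          /* expands via __DEFINE_SRCU() above */

        static void demo_reader(void)
        {
                int idx;

                idx = srcu_read_lock(&demo_srcu);
                /* ... dereference SRCU-protected data here ... */
                srcu_read_unlock(&demo_srcu, idx);
        }

        static void demo_updater(void)
        {
                /* ... unpublish the old data ... */
                synchronize_srcu(&demo_srcu);   /* wait out pre-existing readers */
                /* ... free the old data ... */
        }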
10230 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
10231 index d9718378a8be..e81e6dc7dcb1 100644
10232 --- a/include/linux/suspend.h
10233 +++ b/include/linux/suspend.h
10234 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
10235         void (*end)(void);
10236  };
10237  
10238 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
10239 +extern bool pm_in_action;
10240 +#else
10241 +# define pm_in_action false
10242 +#endif
10243 +
10244  #ifdef CONFIG_SUSPEND
10245  /**
10246   * suspend_set_ops - set platform dependent suspend operations
10247 diff --git a/include/linux/swait.h b/include/linux/swait.h
10248 index c1f9c62a8a50..83f004a72320 100644
10249 --- a/include/linux/swait.h
10250 +++ b/include/linux/swait.h
10251 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
10252  extern void swake_up(struct swait_queue_head *q);
10253  extern void swake_up_all(struct swait_queue_head *q);
10254  extern void swake_up_locked(struct swait_queue_head *q);
10255 +extern void swake_up_all_locked(struct swait_queue_head *q);
10256  
10257  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
10258  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
10259 diff --git a/include/linux/swap.h b/include/linux/swap.h
10260 index 55ff5593c193..52bf5477dc92 100644
10261 --- a/include/linux/swap.h
10262 +++ b/include/linux/swap.h
10263 @@ -11,6 +11,7 @@
10264  #include <linux/fs.h>
10265  #include <linux/atomic.h>
10266  #include <linux/page-flags.h>
10267 +#include <linux/locallock.h>
10268  #include <asm/page.h>
10269  
10270  struct notifier_block;
10271 @@ -247,7 +248,8 @@ struct swap_info_struct {
10272  void *workingset_eviction(struct address_space *mapping, struct page *page);
10273  bool workingset_refault(void *shadow);
10274  void workingset_activation(struct page *page);
10275 -extern struct list_lru workingset_shadow_nodes;
10276 +extern struct list_lru __workingset_shadow_nodes;
10277 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
10278  
10279  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
10280  {
10281 @@ -292,6 +294,7 @@ extern unsigned long nr_free_pagecache_pages(void);
10282  
10283  
10284  /* linux/mm/swap.c */
10285 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
10286  extern void lru_cache_add(struct page *);
10287  extern void lru_cache_add_anon(struct page *page);
10288  extern void lru_cache_add_file(struct page *page);
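
The DECLARE_LOCAL_IRQ_LOCK() lines rely on the locallock primitives this patch introduces in <linux/locallock.h> (not visible in this excerpt): on !RT they collapse to local_irq_save()/local_irq_disable(), on RT they take a per-CPU sleeping lock and leave the section preemptible. A hedged usage sketch, assuming the local_lock_irqsave()/local_unlock_irqrestore() helpers from that header and an invented lock/data pair:

        #include <linux/locallock.h>
        #include <linux/percpu.h>

        static DEFINE_LOCAL_IRQ_LOCK(demo_lock);        /* per-CPU "IRQ" lock */
        static DEFINE_PER_CPU(unsigned long, demo_stat);

        static void demo_bump(void)
        {
                unsigned long flags;

                /* !RT: local_irq_save(); RT: per-CPU rtmutex, stays preemptible. */
                local_lock_irqsave(demo_lock, flags);
                __this_cpu_inc(demo_stat);
                local_unlock_irqrestore(demo_lock, flags);
        }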
10289 diff --git a/include/linux/swork.h b/include/linux/swork.h
10290 new file mode 100644
10291 index 000000000000..f175fa9a6016
10292 --- /dev/null
10293 +++ b/include/linux/swork.h
10294 @@ -0,0 +1,24 @@
10295 +#ifndef _LINUX_SWORK_H
10296 +#define _LINUX_SWORK_H
10297 +
10298 +#include <linux/list.h>
10299 +
10300 +struct swork_event {
10301 +       struct list_head item;
10302 +       unsigned long flags;
10303 +       void (*func)(struct swork_event *);
10304 +};
10305 +
10306 +static inline void INIT_SWORK(struct swork_event *event,
10307 +                             void (*func)(struct swork_event *))
10308 +{
10309 +       event->flags = 0;
10310 +       event->func = func;
10311 +}
10312 +
10313 +bool swork_queue(struct swork_event *sev);
10314 +
10315 +int swork_get(void);
10316 +void swork_put(void);
10317 +
10318 +#endif /* _LINUX_SWORK_H */
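
swork is the "simple work" facility the patch falls back to where queue_work() is not usable from RT contexts; the kernel/cgroup.c hunk further down converts css_release() to it. A hedged sketch of the calling pattern, based only on the declarations above (names invented):

        #include <linux/swork.h>

        static struct swork_event demo_event;

        static void demo_handler(struct swork_event *sev)
        {
                /* runs later in the swork worker thread, fully preemptible */
        }

        static int demo_init(void)
        {
                int err;

                err = swork_get();              /* take a reference on the worker */
                if (err)
                        return err;
                INIT_SWORK(&demo_event, demo_handler);
                return 0;
        }

        static void demo_trigger(void)
        {
                swork_queue(&demo_event);       /* hand the event to the worker */
        }

        static void demo_exit(void)
        {
                swork_put();                    /* drop our reference again */
        }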
10319 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
10320 index 2873baf5372a..eb1a108f17ca 100644
10321 --- a/include/linux/thread_info.h
10322 +++ b/include/linux/thread_info.h
10323 @@ -107,7 +107,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
10324  #define test_thread_flag(flag) \
10325         test_ti_thread_flag(current_thread_info(), flag)
10326  
10327 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
10328 +#ifdef CONFIG_PREEMPT_LAZY
10329 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
10330 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
10331 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
10332 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
10333 +
10334 +#else
10335 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
10336 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
10337 +#define tif_need_resched_lazy()        0
10338 +#endif
10339  
10340  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
10341  static inline int arch_within_stack_frames(const void * const stack,
10342 diff --git a/include/linux/timer.h b/include/linux/timer.h
10343 index 51d601f192d4..83cea629efe1 100644
10344 --- a/include/linux/timer.h
10345 +++ b/include/linux/timer.h
10346 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
10347  
10348  extern int try_to_del_timer_sync(struct timer_list *timer);
10349  
10350 -#ifdef CONFIG_SMP
10351 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
10352    extern int del_timer_sync(struct timer_list *timer);
10353  #else
10354  # define del_timer_sync(t)             del_timer(t)
10355 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
10356 index be007610ceb0..15154b13a53b 100644
10357 --- a/include/linux/trace_events.h
10358 +++ b/include/linux/trace_events.h
10359 @@ -56,6 +56,9 @@ struct trace_entry {
10360         unsigned char           flags;
10361         unsigned char           preempt_count;
10362         int                     pid;
10363 +       unsigned short          migrate_disable;
10364 +       unsigned short          padding;
10365 +       unsigned char           preempt_lazy_count;
10366  };
10367  
10368  #define TRACE_EVENT_TYPE_MAX                                           \
10369 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
10370 index f30c187ed785..83bf0f798426 100644
10371 --- a/include/linux/uaccess.h
10372 +++ b/include/linux/uaccess.h
10373 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
10374   */
10375  static inline void pagefault_disable(void)
10376  {
10377 +       migrate_disable();
10378         pagefault_disabled_inc();
10379         /*
10380          * make sure to have issued the store before a pagefault
10381 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
10382          */
10383         barrier();
10384         pagefault_disabled_dec();
10385 +       migrate_enable();
10386  }
10387  
10388  /*
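
With migrate_disable()/migrate_enable() added above, a pagefault-disabled section on RT is pinned to its CPU but remains preemptible. The usual pairing with the *_inatomic user-copy helpers keeps working; an illustrative sketch, not from the patch:

        #include <linux/uaccess.h>
        #include <linux/errno.h>

        /* Attempt a copy that must not fault in (e.g. while holding a lock). */
        static int demo_peek_user(void *dst, const void __user *src,
                                  unsigned long len)
        {
                unsigned long left;

                pagefault_disable();            /* now also migrate_disable() */
                left = __copy_from_user_inatomic(dst, src, len);
                pagefault_enable();

                /* caller is expected to retry with a faulting copy on failure */
                return left ? -EFAULT : 0;
        }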
10389 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
10390 index 4a29c75b146e..0a294e950df8 100644
10391 --- a/include/linux/uprobes.h
10392 +++ b/include/linux/uprobes.h
10393 @@ -27,6 +27,7 @@
10394  #include <linux/errno.h>
10395  #include <linux/rbtree.h>
10396  #include <linux/types.h>
10397 +#include <linux/wait.h>
10398  
10399  struct vm_area_struct;
10400  struct mm_struct;
10401 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
10402 index 613771909b6e..e28c5a43229d 100644
10403 --- a/include/linux/vmstat.h
10404 +++ b/include/linux/vmstat.h
10405 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
10406   */
10407  static inline void __count_vm_event(enum vm_event_item item)
10408  {
10409 +       preempt_disable_rt();
10410         raw_cpu_inc(vm_event_states.event[item]);
10411 +       preempt_enable_rt();
10412  }
10413  
10414  static inline void count_vm_event(enum vm_event_item item)
10415 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
10416  
10417  static inline void __count_vm_events(enum vm_event_item item, long delta)
10418  {
10419 +       preempt_disable_rt();
10420         raw_cpu_add(vm_event_states.event[item], delta);
10421 +       preempt_enable_rt();
10422  }
10423  
10424  static inline void count_vm_events(enum vm_event_item item, long delta)
10425 diff --git a/include/linux/wait.h b/include/linux/wait.h
10426 index 2408e8d5c05c..db50d6609195 100644
10427 --- a/include/linux/wait.h
10428 +++ b/include/linux/wait.h
10429 @@ -8,6 +8,7 @@
10430  #include <linux/spinlock.h>
10431  #include <asm/current.h>
10432  #include <uapi/linux/wait.h>
10433 +#include <linux/atomic.h>
10434  
10435  typedef struct __wait_queue wait_queue_t;
10436  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
10437 diff --git a/include/net/dst.h b/include/net/dst.h
10438 index 6835d224d47b..55a5a9698f14 100644
10439 --- a/include/net/dst.h
10440 +++ b/include/net/dst.h
10441 @@ -446,7 +446,7 @@ static inline void dst_confirm(struct dst_entry *dst)
10442  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
10443                                    struct sk_buff *skb)
10444  {
10445 -       const struct hh_cache *hh;
10446 +       struct hh_cache *hh;
10447  
10448         if (dst->pending_confirm) {
10449                 unsigned long now = jiffies;
10450 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
10451 index 231e121cc7d9..d125222b979d 100644
10452 --- a/include/net/gen_stats.h
10453 +++ b/include/net/gen_stats.h
10454 @@ -5,6 +5,7 @@
10455  #include <linux/socket.h>
10456  #include <linux/rtnetlink.h>
10457  #include <linux/pkt_sched.h>
10458 +#include <net/net_seq_lock.h>
10459  
10460  struct gnet_stats_basic_cpu {
10461         struct gnet_stats_basic_packed bstats;
10462 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
10463                                  spinlock_t *lock, struct gnet_dump *d,
10464                                  int padattr);
10465  
10466 -int gnet_stats_copy_basic(const seqcount_t *running,
10467 +int gnet_stats_copy_basic(net_seqlock_t *running,
10468                           struct gnet_dump *d,
10469                           struct gnet_stats_basic_cpu __percpu *cpu,
10470                           struct gnet_stats_basic_packed *b);
10471 -void __gnet_stats_copy_basic(const seqcount_t *running,
10472 +void __gnet_stats_copy_basic(net_seqlock_t *running,
10473                              struct gnet_stats_basic_packed *bstats,
10474                              struct gnet_stats_basic_cpu __percpu *cpu,
10475                              struct gnet_stats_basic_packed *b);
10476 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
10477                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10478                       struct gnet_stats_rate_est64 *rate_est,
10479                       spinlock_t *stats_lock,
10480 -                     seqcount_t *running, struct nlattr *opt);
10481 +                     net_seqlock_t *running, struct nlattr *opt);
10482  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
10483                         struct gnet_stats_rate_est64 *rate_est);
10484  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
10485                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
10486                           struct gnet_stats_rate_est64 *rate_est,
10487                           spinlock_t *stats_lock,
10488 -                         seqcount_t *running, struct nlattr *opt);
10489 +                         net_seqlock_t *running, struct nlattr *opt);
10490  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
10491                           const struct gnet_stats_rate_est64 *rate_est);
10492  #endif
10493 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
10494 index 8b683841e574..bf656008f6e7 100644
10495 --- a/include/net/neighbour.h
10496 +++ b/include/net/neighbour.h
10497 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
10498  }
10499  #endif
10500  
10501 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
10502 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
10503  {
10504         unsigned int seq;
10505         int hh_len;
10506 @@ -501,7 +501,7 @@ struct neighbour_cb {
10507  
10508  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
10509  
10510 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
10511 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
10512                                      const struct net_device *dev)
10513  {
10514         unsigned int seq;
10515 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
10516 new file mode 100644
10517 index 000000000000..a7034298a82a
10518 --- /dev/null
10519 +++ b/include/net/net_seq_lock.h
10520 @@ -0,0 +1,15 @@
10521 +#ifndef __NET_NET_SEQ_LOCK_H__
10522 +#define __NET_NET_SEQ_LOCK_H__
10523 +
10524 +#ifdef CONFIG_PREEMPT_RT_BASE
10525 +# define net_seqlock_t                 seqlock_t
10526 +# define net_seq_begin(__r)            read_seqbegin(__r)
10527 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
10528 +
10529 +#else
10530 +# define net_seqlock_t                 seqcount_t
10531 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
10532 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
10533 +#endif
10534 +
10535 +#endif
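
net_seqlock_t lets the qdisc statistics code keep a single source tree for both configurations: a plain seqcount_t on !RT, a full seqlock_t (with a boostable writer) on RT_BASE. Readers go through net_seq_begin()/net_seq_retry(); a hedged sketch with an invented stats structure:

        #include <net/net_seq_lock.h>

        struct demo_stats {
                net_seqlock_t   running;        /* written by the dispatch path;
                                                   initialization done elsewhere */
                u64             packets;
        };

        static u64 demo_read_packets(struct demo_stats *s)
        {
                unsigned int seq;
                u64 packets;

                do {
                        seq = net_seq_begin(&s->running);
                        packets = s->packets;
                } while (net_seq_retry(&s->running, seq));

                return packets;
        }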
10536 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
10537 index 7adf4386ac8f..d3fd5c357268 100644
10538 --- a/include/net/netns/ipv4.h
10539 +++ b/include/net/netns/ipv4.h
10540 @@ -69,6 +69,7 @@ struct netns_ipv4 {
10541  
10542         int sysctl_icmp_echo_ignore_all;
10543         int sysctl_icmp_echo_ignore_broadcasts;
10544 +       int sysctl_icmp_echo_sysrq;
10545         int sysctl_icmp_ignore_bogus_error_responses;
10546         int sysctl_icmp_ratelimit;
10547         int sysctl_icmp_ratemask;
10548 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
10549 index e6aa0a249672..b57736f2a8a3 100644
10550 --- a/include/net/sch_generic.h
10551 +++ b/include/net/sch_generic.h
10552 @@ -10,6 +10,7 @@
10553  #include <linux/dynamic_queue_limits.h>
10554  #include <net/gen_stats.h>
10555  #include <net/rtnetlink.h>
10556 +#include <net/net_seq_lock.h>
10557  
10558  struct Qdisc_ops;
10559  struct qdisc_walker;
10560 @@ -86,7 +87,7 @@ struct Qdisc {
10561         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
10562         struct qdisc_skb_head   q;
10563         struct gnet_stats_basic_packed bstats;
10564 -       seqcount_t              running;
10565 +       net_seqlock_t           running;
10566         struct gnet_stats_queue qstats;
10567         unsigned long           state;
10568         struct Qdisc            *next_sched;
10569 @@ -98,13 +99,22 @@ struct Qdisc {
10570         spinlock_t              busylock ____cacheline_aligned_in_smp;
10571  };
10572  
10573 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
10574 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
10575  {
10576 +#ifdef CONFIG_PREEMPT_RT_BASE
10577 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
10578 +#else
10579         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
10580 +#endif
10581  }
10582  
10583  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10584  {
10585 +#ifdef CONFIG_PREEMPT_RT_BASE
10586 +       if (try_write_seqlock(&qdisc->running))
10587 +               return true;
10588 +       return false;
10589 +#else
10590         if (qdisc_is_running(qdisc))
10591                 return false;
10592         /* Variant of write_seqcount_begin() telling lockdep a trylock
10593 @@ -113,11 +123,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
10594         raw_write_seqcount_begin(&qdisc->running);
10595         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
10596         return true;
10597 +#endif
10598  }
10599  
10600  static inline void qdisc_run_end(struct Qdisc *qdisc)
10601  {
10602 +#ifdef CONFIG_PREEMPT_RT_BASE
10603 +       write_sequnlock(&qdisc->running);
10604 +#else
10605         write_seqcount_end(&qdisc->running);
10606 +#endif
10607  }
10608  
10609  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
10610 @@ -308,7 +323,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
10611         return qdisc_lock(root);
10612  }
10613  
10614 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10615 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
10616  {
10617         struct Qdisc *root = qdisc_root_sleeping(qdisc);
10618  
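
On RT, qdisc_run_begin() becomes a try_write_seqlock() on the qdisc's running seqlock, so the "one dispatcher at a time" rule is enforced by a sleeping lock with priority inheritance rather than by a bare seqcount. The calling convention is unchanged; a hedged, simplified sketch of the dispatch pattern built on these helpers (not the actual net/core code):

        #include <net/sch_generic.h>

        static void demo_xmit(struct Qdisc *q)
        {
                /* Only one task may own the running state of this qdisc. */
                if (!qdisc_run_begin(q))
                        return;         /* someone else is already dispatching */

                /* ... dequeue packets from q and hand them to the driver ... */

                qdisc_run_end(q);
        }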
10619 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
10620 new file mode 100644
10621 index 000000000000..f7710de1b1f3
10622 --- /dev/null
10623 +++ b/include/trace/events/hist.h
10624 @@ -0,0 +1,73 @@
10625 +#undef TRACE_SYSTEM
10626 +#define TRACE_SYSTEM hist
10627 +
10628 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
10629 +#define _TRACE_HIST_H
10630 +
10631 +#include "latency_hist.h"
10632 +#include <linux/tracepoint.h>
10633 +
10634 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
10635 +#define trace_preemptirqsoff_hist(a, b)
10636 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
10637 +#else
10638 +TRACE_EVENT(preemptirqsoff_hist,
10639 +
10640 +       TP_PROTO(int reason, int starthist),
10641 +
10642 +       TP_ARGS(reason, starthist),
10643 +
10644 +       TP_STRUCT__entry(
10645 +               __field(int,    reason)
10646 +               __field(int,    starthist)
10647 +       ),
10648 +
10649 +       TP_fast_assign(
10650 +               __entry->reason         = reason;
10651 +               __entry->starthist      = starthist;
10652 +       ),
10653 +
10654 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
10655 +                 __entry->starthist ? "start" : "stop")
10656 +);
10657 +#endif
10658 +
10659 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
10660 +#define trace_hrtimer_interrupt(a, b, c, d)
10661 +#else
10662 +TRACE_EVENT(hrtimer_interrupt,
10663 +
10664 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
10665 +               struct task_struct *task),
10666 +
10667 +       TP_ARGS(cpu, offset, curr, task),
10668 +
10669 +       TP_STRUCT__entry(
10670 +               __field(int,            cpu)
10671 +               __field(long long,      offset)
10672 +               __array(char,           ccomm,  TASK_COMM_LEN)
10673 +               __field(int,            cprio)
10674 +               __array(char,           tcomm,  TASK_COMM_LEN)
10675 +               __field(int,            tprio)
10676 +       ),
10677 +
10678 +       TP_fast_assign(
10679 +               __entry->cpu    = cpu;
10680 +               __entry->offset = offset;
10681 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
10682 +               __entry->cprio  = curr->prio;
10683 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
10684 +                       task != NULL ? TASK_COMM_LEN : 7);
10685 +               __entry->tprio  = task != NULL ? task->prio : -1;
10686 +       ),
10687 +
10688 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
10689 +               __entry->cpu, __entry->offset, __entry->ccomm,
10690 +               __entry->cprio, __entry->tcomm, __entry->tprio)
10691 +);
10692 +#endif
10693 +
10694 +#endif /* _TRACE_HIST_H */
10695 +
10696 +/* This part must be outside protection */
10697 +#include <trace/define_trace.h>
10698 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
10699 new file mode 100644
10700 index 000000000000..d3f2fbd560b1
10701 --- /dev/null
10702 +++ b/include/trace/events/latency_hist.h
10703 @@ -0,0 +1,29 @@
10704 +#ifndef _LATENCY_HIST_H
10705 +#define _LATENCY_HIST_H
10706 +
10707 +enum hist_action {
10708 +       IRQS_ON,
10709 +       PREEMPT_ON,
10710 +       TRACE_STOP,
10711 +       IRQS_OFF,
10712 +       PREEMPT_OFF,
10713 +       TRACE_START,
10714 +};
10715 +
10716 +static char *actions[] = {
10717 +       "IRQS_ON",
10718 +       "PREEMPT_ON",
10719 +       "TRACE_STOP",
10720 +       "IRQS_OFF",
10721 +       "PREEMPT_OFF",
10722 +       "TRACE_START",
10723 +};
10724 +
10725 +static inline char *getaction(int action)
10726 +{
10727 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
10728 +               return actions[action];
10729 +       return "unknown";
10730 +}
10731 +
10732 +#endif /* _LATENCY_HIST_H */
10733 diff --git a/init/Kconfig b/init/Kconfig
10734 index 34407f15e6d3..2ce33a32e65d 100644
10735 --- a/init/Kconfig
10736 +++ b/init/Kconfig
10737 @@ -506,7 +506,7 @@ config TINY_RCU
10738  
10739  config RCU_EXPERT
10740         bool "Make expert-level adjustments to RCU configuration"
10741 -       default n
10742 +       default y if PREEMPT_RT_FULL
10743         help
10744           This option needs to be enabled if you wish to make
10745           expert-level adjustments to RCU configuration.  By default,
10746 @@ -623,7 +623,7 @@ config RCU_FANOUT_LEAF
10747  
10748  config RCU_FAST_NO_HZ
10749         bool "Accelerate last non-dyntick-idle CPU's grace periods"
10750 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
10751 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
10752         default n
10753         help
10754           This option permits CPUs to enter dynticks-idle state even if
10755 @@ -650,7 +650,7 @@ config TREE_RCU_TRACE
10756  config RCU_BOOST
10757         bool "Enable RCU priority boosting"
10758         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
10759 -       default n
10760 +       default y if PREEMPT_RT_FULL
10761         help
10762           This option boosts the priority of preempted RCU readers that
10763           block the current preemptible RCU grace period for too long.
10764 @@ -781,19 +781,6 @@ config RCU_NOCB_CPU_ALL
10765  
10766  endchoice
10767  
10768 -config RCU_EXPEDITE_BOOT
10769 -       bool
10770 -       default n
10771 -       help
10772 -         This option enables expedited grace periods at boot time,
10773 -         as if rcu_expedite_gp() had been invoked early in boot.
10774 -         The corresponding rcu_unexpedite_gp() is invoked from
10775 -         rcu_end_inkernel_boot(), which is intended to be invoked
10776 -         at the end of the kernel-only boot sequence, just before
10777 -         init is exec'ed.
10778 -
10779 -         Accept the default if unsure.
10780 -
10781  endmenu # "RCU Subsystem"
10782  
10783  config BUILD_BIN2C
10784 @@ -1064,6 +1051,7 @@ config CFS_BANDWIDTH
10785  config RT_GROUP_SCHED
10786         bool "Group scheduling for SCHED_RR/FIFO"
10787         depends on CGROUP_SCHED
10788 +       depends on !PREEMPT_RT_FULL
10789         default n
10790         help
10791           This feature lets you explicitly allocate real CPU bandwidth
10792 @@ -1772,6 +1760,7 @@ choice
10793  
10794  config SLAB
10795         bool "SLAB"
10796 +       depends on !PREEMPT_RT_FULL
10797         select HAVE_HARDENED_USERCOPY_ALLOCATOR
10798         help
10799           The regular slab allocator that is established and known to work
10800 @@ -1792,6 +1781,7 @@ config SLUB
10801  config SLOB
10802         depends on EXPERT
10803         bool "SLOB (Simple Allocator)"
10804 +       depends on !PREEMPT_RT_FULL
10805         help
10806            SLOB replaces the stock allocator with a drastically simpler
10807            allocator. SLOB is generally more space efficient but
10808 @@ -1810,7 +1800,7 @@ config SLAB_FREELIST_RANDOM
10809  
10810  config SLUB_CPU_PARTIAL
10811         default y
10812 -       depends on SLUB && SMP
10813 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
10814         bool "SLUB per cpu partial cache"
10815         help
10816           Per cpu partial caches accelerate object allocation and freeing
10817 diff --git a/init/Makefile b/init/Makefile
10818 index c4fb45525d08..821190dfaa75 100644
10819 --- a/init/Makefile
10820 +++ b/init/Makefile
10821 @@ -35,4 +35,4 @@ $(obj)/version.o: include/generated/compile.h
10822  include/generated/compile.h: FORCE
10823         @$($(quiet)chk_compile.h)
10824         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
10825 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
10826 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
10827 diff --git a/init/main.c b/init/main.c
10828 index 2858be732f6d..3c97c3c91d88 100644
10829 --- a/init/main.c
10830 +++ b/init/main.c
10831 @@ -507,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
10832         setup_command_line(command_line);
10833         setup_nr_cpu_ids();
10834         setup_per_cpu_areas();
10835 +       softirq_early_init();
10836         boot_cpu_state_init();
10837         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
10838  
10839 diff --git a/ipc/sem.c b/ipc/sem.c
10840 index 10b94bc59d4a..b8360eaacc7a 100644
10841 --- a/ipc/sem.c
10842 +++ b/ipc/sem.c
10843 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
10844  static void wake_up_sem_queue_prepare(struct list_head *pt,
10845                                 struct sem_queue *q, int error)
10846  {
10847 +#ifdef CONFIG_PREEMPT_RT_BASE
10848 +       struct task_struct *p = q->sleeper;
10849 +       get_task_struct(p);
10850 +       q->status = error;
10851 +       wake_up_process(p);
10852 +       put_task_struct(p);
10853 +#else
10854         if (list_empty(pt)) {
10855                 /*
10856                  * Hold preempt off so that we don't get preempted and have the
10857 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
10858         q->pid = error;
10859  
10860         list_add_tail(&q->list, pt);
10861 +#endif
10862  }
10863  
10864  /**
10865 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
10866   */
10867  static void wake_up_sem_queue_do(struct list_head *pt)
10868  {
10869 +#ifndef CONFIG_PREEMPT_RT_BASE
10870         struct sem_queue *q, *t;
10871         int did_something;
10872  
10873 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
10874         }
10875         if (did_something)
10876                 preempt_enable();
10877 +#endif
10878  }
10879  
10880  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
10881 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
10882 index ebdb0043203a..b9e6aa7e5aa6 100644
10883 --- a/kernel/Kconfig.locks
10884 +++ b/kernel/Kconfig.locks
10885 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
10886  
10887  config MUTEX_SPIN_ON_OWNER
10888         def_bool y
10889 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
10890 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10891  
10892  config RWSEM_SPIN_ON_OWNER
10893         def_bool y
10894 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
10895 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
10896  
10897  config LOCK_SPIN_ON_OWNER
10898         def_bool y
10899 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
10900 index 3f9c97419f02..11dbe26a8279 100644
10901 --- a/kernel/Kconfig.preempt
10902 +++ b/kernel/Kconfig.preempt
10903 @@ -1,3 +1,16 @@
10904 +config PREEMPT
10905 +       bool
10906 +       select PREEMPT_COUNT
10907 +
10908 +config PREEMPT_RT_BASE
10909 +       bool
10910 +       select PREEMPT
10911 +
10912 +config HAVE_PREEMPT_LAZY
10913 +       bool
10914 +
10915 +config PREEMPT_LAZY
10916 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
10917  
10918  choice
10919         prompt "Preemption Model"
10920 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
10921  
10922           Select this if you are building a kernel for a desktop system.
10923  
10924 -config PREEMPT
10925 +config PREEMPT__LL
10926         bool "Preemptible Kernel (Low-Latency Desktop)"
10927 -       select PREEMPT_COUNT
10928 +       select PREEMPT
10929         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
10930         help
10931           This option reduces the latency of the kernel by making
10932 @@ -52,6 +65,22 @@ config PREEMPT
10933           embedded system with latency requirements in the milliseconds
10934           range.
10935  
10936 +config PREEMPT_RTB
10937 +       bool "Preemptible Kernel (Basic RT)"
10938 +       select PREEMPT_RT_BASE
10939 +       help
10940 +         This option is basically the same as (Low-Latency Desktop) but
10941 +         enables changes which are preliminary for the full preemptible
10942 +         RT kernel.
10943 +
10944 +config PREEMPT_RT_FULL
10945 +       bool "Fully Preemptible Kernel (RT)"
10946 +       depends on IRQ_FORCED_THREADING
10947 +       select PREEMPT_RT_BASE
10948 +       select PREEMPT_RCU
10949 +       help
10950 +         Enables the complete set of PREEMPT_RT substitutions for a fully preemptible kernel.
10951 +
10952  endchoice
10953  
10954  config PREEMPT_COUNT
10955 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
10956 index 85bc9beb046d..3b8da75ba2e0 100644
10957 --- a/kernel/cgroup.c
10958 +++ b/kernel/cgroup.c
10959 @@ -5040,10 +5040,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
10960         queue_work(cgroup_destroy_wq, &css->destroy_work);
10961  }
10962  
10963 -static void css_release_work_fn(struct work_struct *work)
10964 +static void css_release_work_fn(struct swork_event *sev)
10965  {
10966         struct cgroup_subsys_state *css =
10967 -               container_of(work, struct cgroup_subsys_state, destroy_work);
10968 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
10969         struct cgroup_subsys *ss = css->ss;
10970         struct cgroup *cgrp = css->cgroup;
10971  
10972 @@ -5086,8 +5086,8 @@ static void css_release(struct percpu_ref *ref)
10973         struct cgroup_subsys_state *css =
10974                 container_of(ref, struct cgroup_subsys_state, refcnt);
10975  
10976 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
10977 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
10978 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
10979 +       swork_queue(&css->destroy_swork);
10980  }
10981  
10982  static void init_and_link_css(struct cgroup_subsys_state *css,
10983 @@ -5742,6 +5742,7 @@ static int __init cgroup_wq_init(void)
10984          */
10985         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
10986         BUG_ON(!cgroup_destroy_wq);
10987 +       BUG_ON(swork_get());
10988  
10989         /*
10990          * Used to destroy pidlists and separate to serve as flush domain.
10991 diff --git a/kernel/cpu.c b/kernel/cpu.c
10992 index 217fd2e7f435..69444f1bc924 100644
10993 --- a/kernel/cpu.c
10994 +++ b/kernel/cpu.c
10995 @@ -239,6 +239,289 @@ static struct {
10996  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
10997  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
10998  
10999 +/**
11000 + * hotplug_pcp - per cpu hotplug descriptor
11001 + * @unplug:    set when pin_current_cpu() needs to sync tasks
11002 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
11003 + * @refcount:  counter of tasks in pinned sections
11004 + * @grab_lock: set when the tasks entering pinned sections should wait
11005 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
11006 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
11007 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
11008 + *
11009 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
11010 + * is used as a flag and still exists after @sync_tsk has exited and
11011 + * @sync_tsk set to NULL.
11012 + */
11013 +struct hotplug_pcp {
11014 +       struct task_struct *unplug;
11015 +       struct task_struct *sync_tsk;
11016 +       int refcount;
11017 +       int grab_lock;
11018 +       struct completion synced;
11019 +       struct completion unplug_wait;
11020 +#ifdef CONFIG_PREEMPT_RT_FULL
11021 +       /*
11022 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
11023 +        * the task, otherwise the mutex will cause the task to fail
11024 +        * to sleep when required. (Because it's called from migrate_disable())
11025 +        *
11026 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
11027 +        * state.
11028 +        */
11029 +       spinlock_t lock;
11030 +#else
11031 +       struct mutex mutex;
11032 +#endif
11033 +       int mutex_init;
11034 +};
11035 +
11036 +#ifdef CONFIG_PREEMPT_RT_FULL
11037 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
11038 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
11039 +#else
11040 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
11041 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
11042 +#endif
11043 +
11044 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
11045 +
11046 +/**
11047 + * pin_current_cpu - Prevent the current cpu from being unplugged
11048 + *
11049 + * Lightweight version of get_online_cpus() to prevent cpu from being
11050 + * unplugged when code runs in a migration disabled region.
11051 + *
11052 + * Must be called with preemption disabled (preempt_count = 1)!
11053 + */
11054 +void pin_current_cpu(void)
11055 +{
11056 +       struct hotplug_pcp *hp;
11057 +       int force = 0;
11058 +
11059 +retry:
11060 +       hp = this_cpu_ptr(&hotplug_pcp);
11061 +
11062 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
11063 +           hp->unplug == current) {
11064 +               hp->refcount++;
11065 +               return;
11066 +       }
11067 +       if (hp->grab_lock) {
11068 +               preempt_enable();
11069 +               hotplug_lock(hp);
11070 +               hotplug_unlock(hp);
11071 +       } else {
11072 +               preempt_enable();
11073 +               /*
11074 +                * Try to push this task off of this CPU.
11075 +                */
11076 +               if (!migrate_me()) {
11077 +                       preempt_disable();
11078 +                       hp = this_cpu_ptr(&hotplug_pcp);
11079 +                       if (!hp->grab_lock) {
11080 +                               /*
11081 +                                * Just let it continue; it's already pinned
11082 +                                * or about to sleep.
11083 +                                */
11084 +                               force = 1;
11085 +                               goto retry;
11086 +                       }
11087 +                       preempt_enable();
11088 +               }
11089 +       }
11090 +       preempt_disable();
11091 +       goto retry;
11092 +}
11093 +
11094 +/**
11095 + * unpin_current_cpu - Allow unplug of current cpu
11096 + *
11097 + * Must be called with preemption or interrupts disabled!
11098 + */
11099 +void unpin_current_cpu(void)
11100 +{
11101 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
11102 +
11103 +       WARN_ON(hp->refcount <= 0);
11104 +
11105 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
11106 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
11107 +               wake_up_process(hp->unplug);
11108 +}
11109 +
11110 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
11111 +{
11112 +       set_current_state(TASK_UNINTERRUPTIBLE);
11113 +       while (hp->refcount) {
11114 +               schedule_preempt_disabled();
11115 +               set_current_state(TASK_UNINTERRUPTIBLE);
11116 +       }
11117 +}
11118 +
11119 +static int sync_unplug_thread(void *data)
11120 +{
11121 +       struct hotplug_pcp *hp = data;
11122 +
11123 +       wait_for_completion(&hp->unplug_wait);
11124 +       preempt_disable();
11125 +       hp->unplug = current;
11126 +       wait_for_pinned_cpus(hp);
11127 +
11128 +       /*
11129 +        * This thread will synchronize the cpu_down() with threads
11130 +        * that have pinned the CPU. When the pinned CPU count reaches
11131 +        * zero, we inform the cpu_down code to continue to the next step.
11132 +        */
11133 +       set_current_state(TASK_UNINTERRUPTIBLE);
11134 +       preempt_enable();
11135 +       complete(&hp->synced);
11136 +
11137 +       /*
11138 +        * If all succeeds, the next step will need tasks to wait till
11139 +        * the CPU is offline before continuing. To do this, the grab_lock
11140 +        * is set and tasks going into pin_current_cpu() will block on the
11141 +        * mutex. But we still need to wait for those that are already in
11142 +        * pinned CPU sections. If the cpu_down() failed, kthread_should_stop()
11143 +        * will kick this thread out.
11144 +        */
11145 +       while (!hp->grab_lock && !kthread_should_stop()) {
11146 +               schedule();
11147 +               set_current_state(TASK_UNINTERRUPTIBLE);
11148 +       }
11149 +
11150 +       /* Make sure grab_lock is seen before we see a stale completion */
11151 +       smp_mb();
11152 +
11153 +       /*
11154 +        * Now just before cpu_down() enters stop machine, we need to make
11155 +        * sure all tasks that are in pinned CPU sections are out, and new
11156 +        * tasks will now grab the lock, keeping them from entering pinned
11157 +        * CPU sections.
11158 +        */
11159 +       if (!kthread_should_stop()) {
11160 +               preempt_disable();
11161 +               wait_for_pinned_cpus(hp);
11162 +               preempt_enable();
11163 +               complete(&hp->synced);
11164 +       }
11165 +
11166 +       set_current_state(TASK_UNINTERRUPTIBLE);
11167 +       while (!kthread_should_stop()) {
11168 +               schedule();
11169 +               set_current_state(TASK_UNINTERRUPTIBLE);
11170 +       }
11171 +       set_current_state(TASK_RUNNING);
11172 +
11173 +       /*
11174 +        * Force this thread off this CPU as it's going down and
11175 +        * we don't want any more work on this CPU.
11176 +        */
11177 +       current->flags &= ~PF_NO_SETAFFINITY;
11178 +       set_cpus_allowed_ptr(current, cpu_present_mask);
11179 +       migrate_me();
11180 +       return 0;
11181 +}
11182 +
11183 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
11184 +{
11185 +       wake_up_process(hp->sync_tsk);
11186 +       wait_for_completion(&hp->synced);
11187 +}
11188 +
11189 +static void __cpu_unplug_wait(unsigned int cpu)
11190 +{
11191 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11192 +
11193 +       complete(&hp->unplug_wait);
11194 +       wait_for_completion(&hp->synced);
11195 +}
11196 +
11197 +/*
11198 + * Start the sync_unplug_thread on the target cpu and wait for it to
11199 + * complete.
11200 + */
11201 +static int cpu_unplug_begin(unsigned int cpu)
11202 +{
11203 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11204 +       int err;
11205 +
11206 +       /* Protected by cpu_hotplug.lock */
11207 +       if (!hp->mutex_init) {
11208 +#ifdef CONFIG_PREEMPT_RT_FULL
11209 +               spin_lock_init(&hp->lock);
11210 +#else
11211 +               mutex_init(&hp->mutex);
11212 +#endif
11213 +               hp->mutex_init = 1;
11214 +       }
11215 +
11216 +       /* Inform the scheduler to migrate tasks off this CPU */
11217 +       tell_sched_cpu_down_begin(cpu);
11218 +
11219 +       init_completion(&hp->synced);
11220 +       init_completion(&hp->unplug_wait);
11221 +
11222 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
11223 +       if (IS_ERR(hp->sync_tsk)) {
11224 +               err = PTR_ERR(hp->sync_tsk);
11225 +               hp->sync_tsk = NULL;
11226 +               return err;
11227 +       }
11228 +       kthread_bind(hp->sync_tsk, cpu);
11229 +
11230 +       /*
11231 +        * Wait for tasks to get out of the pinned sections;
11232 +        * it's still OK if new tasks enter. Some CPU notifiers will
11233 +        * wait for tasks that are going to enter these sections and
11234 +        * we must not have them block.
11235 +        */
11236 +       wake_up_process(hp->sync_tsk);
11237 +       return 0;
11238 +}
11239 +
11240 +static void cpu_unplug_sync(unsigned int cpu)
11241 +{
11242 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11243 +
11244 +       init_completion(&hp->synced);
11245 +       /* The completion needs to be initialized before setting grab_lock */
11246 +       smp_wmb();
11247 +
11248 +       /* Grab the mutex before setting grab_lock */
11249 +       hotplug_lock(hp);
11250 +       hp->grab_lock = 1;
11251 +
11252 +       /*
11253 +        * The CPU notifiers have been completed.
11254 +        * Wait for tasks to get out of pinned CPU sections and have new
11255 +        * tasks block until the CPU is completely down.
11256 +        */
11257 +       __cpu_unplug_sync(hp);
11258 +
11259 +       /* All done with the sync thread */
11260 +       kthread_stop(hp->sync_tsk);
11261 +       hp->sync_tsk = NULL;
11262 +}
11263 +
11264 +static void cpu_unplug_done(unsigned int cpu)
11265 +{
11266 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
11267 +
11268 +       hp->unplug = NULL;
11269 +       /* Let all tasks know cpu unplug is finished before cleaning up */
11270 +       smp_wmb();
11271 +
11272 +       if (hp->sync_tsk)
11273 +               kthread_stop(hp->sync_tsk);
11274 +
11275 +       if (hp->grab_lock) {
11276 +               hotplug_unlock(hp);
11277 +               /* protected by cpu_hotplug.lock */
11278 +               hp->grab_lock = 0;
11279 +       }
11280 +       tell_sched_cpu_down_done(cpu);
11281 +}
11282  
11283  void get_online_cpus(void)
11284  {
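
The pin_current_cpu()/unpin_current_cpu() pair above is, at its core, a per-CPU refcount plus a drain-side waiter: pinners bump the count, and the unplug path in wait_for_pinned_cpus() sleeps until it drops to zero. The user-space sketch below mirrors only that refcount-and-drain idea with pthreads; it is an analogy, not kernel code, and every name in it is made up.

/* build: gcc -pthread pin_demo.c -o pin_demo */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int refcount;                      /* cf. hotplug_pcp::refcount */

static void pin(void)                     /* cf. pin_current_cpu() */
{
        pthread_mutex_lock(&lock);
        refcount++;
        pthread_mutex_unlock(&lock);
}

static void unpin(void)                   /* cf. unpin_current_cpu() */
{
        pthread_mutex_lock(&lock);
        if (--refcount == 0)
                pthread_cond_signal(&drained);  /* wake the drain-side waiter */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        (void)arg;
        pin();
        usleep(50 * 1000);                /* pretend to run in a pinned section */
        unpin();
        return NULL;
}

int main(void)
{
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        usleep(10 * 1000);                /* let the workers pin themselves first */

        /* cf. wait_for_pinned_cpus(): wait until every pinned section is left */
        pthread_mutex_lock(&lock);
        while (refcount)
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
        puts("all pinned sections drained");

        for (i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        return 0;
}

As in the kernel code, the waiter only drains sections that are already pinned; keeping new entries out is the separate grab_lock step.
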
11285 @@ -789,10 +1072,14 @@ static int takedown_cpu(unsigned int cpu)
11286         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11287         int err;
11288  
11289 +       __cpu_unplug_wait(cpu);
11290         /* Park the smpboot threads */
11291         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
11292         smpboot_park_threads(cpu);
11293  
11294 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
11295 +       cpu_unplug_sync(cpu);
11296 +
11297         /*
11298          * Prevent irq alloc/free while the dying cpu reorganizes the
11299          * interrupt affinities.
11300 @@ -877,6 +1164,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11301         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
11302         int prev_state, ret = 0;
11303         bool hasdied = false;
11304 +       int mycpu;
11305 +       cpumask_var_t cpumask;
11306 +       cpumask_var_t cpumask_org;
11307  
11308         if (num_online_cpus() == 1)
11309                 return -EBUSY;
11310 @@ -884,7 +1174,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11311         if (!cpu_present(cpu))
11312                 return -EINVAL;
11313  
11314 +       /* Move the downtaker off the unplug cpu */
11315 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
11316 +               return -ENOMEM;
11317 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL))  {
11318 +               free_cpumask_var(cpumask);
11319 +               return -ENOMEM;
11320 +       }
11321 +
11322 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
11323 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
11324 +       set_cpus_allowed_ptr(current, cpumask);
11325 +       free_cpumask_var(cpumask);
11326 +       migrate_disable();
11327 +       mycpu = smp_processor_id();
11328 +       if (mycpu == cpu) {
11329 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
11330 +               migrate_enable();
11331 +               ret = -EBUSY;
11332 +               goto restore_cpus;
11333 +       }
11334 +
11335 +       migrate_enable();
11336         cpu_hotplug_begin();
11337 +       ret = cpu_unplug_begin(cpu);
11338 +       if (ret) {
11339 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
11340 +               goto out_cancel;
11341 +       }
11342  
11343         cpuhp_tasks_frozen = tasks_frozen;
11344  
11345 @@ -923,10 +1240,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
11346  
11347         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
11348  out:
11349 +       cpu_unplug_done(cpu);
11350 +out_cancel:
11351         cpu_hotplug_done();
11352         /* This post dead nonsense must die */
11353         if (!ret && hasdied)
11354                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
11355 +restore_cpus:
11356 +       set_cpus_allowed_ptr(current, cpumask_org);
11357 +       free_cpumask_var(cpumask_org);
11358         return ret;
11359  }
11360  
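
The _cpu_down() hunk above first moves the task that initiates the unplug off the dying CPU: it builds "all online CPUs except the target" with cpumask_andnot() and applies it with set_cpus_allowed_ptr(). The same step looks like this from user space; a minimal sketch, where the default target CPU 1 and the assumption that online CPUs are numbered 0..N-1 are purely illustrative.

/* build: gcc move_off_cpu.c -o move_off_cpu */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int target = argc > 1 ? atoi(argv[1]) : 1;  /* CPU about to go down */
        long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
        cpu_set_t set;
        long cpu;

        /* cf. cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu)) */
        CPU_ZERO(&set);
        for (cpu = 0; cpu < ncpus; cpu++)
                if (cpu != target)
                        CPU_SET(cpu, &set);

        /* cf. set_cpus_allowed_ptr(current, cpumask) */
        if (sched_setaffinity(0, sizeof(set), &set)) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("now running on CPU %d, not on CPU %d\n", sched_getcpu(), target);
        return 0;
}
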
11361 diff --git a/kernel/cpuset.c b/kernel/cpuset.c
11362 index 29f815d2ef7e..341b17f24f95 100644
11363 --- a/kernel/cpuset.c
11364 +++ b/kernel/cpuset.c
11365 @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
11366   */
11367  
11368  static DEFINE_MUTEX(cpuset_mutex);
11369 -static DEFINE_SPINLOCK(callback_lock);
11370 +static DEFINE_RAW_SPINLOCK(callback_lock);
11371  
11372  static struct workqueue_struct *cpuset_migrate_mm_wq;
11373  
11374 @@ -907,9 +907,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
11375                         continue;
11376                 rcu_read_unlock();
11377  
11378 -               spin_lock_irq(&callback_lock);
11379 +               raw_spin_lock_irq(&callback_lock);
11380                 cpumask_copy(cp->effective_cpus, new_cpus);
11381 -               spin_unlock_irq(&callback_lock);
11382 +               raw_spin_unlock_irq(&callback_lock);
11383  
11384                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11385                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
11386 @@ -974,9 +974,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
11387         if (retval < 0)
11388                 return retval;
11389  
11390 -       spin_lock_irq(&callback_lock);
11391 +       raw_spin_lock_irq(&callback_lock);
11392         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
11393 -       spin_unlock_irq(&callback_lock);
11394 +       raw_spin_unlock_irq(&callback_lock);
11395  
11396         /* use trialcs->cpus_allowed as a temp variable */
11397         update_cpumasks_hier(cs, trialcs->cpus_allowed);
11398 @@ -1176,9 +1176,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
11399                         continue;
11400                 rcu_read_unlock();
11401  
11402 -               spin_lock_irq(&callback_lock);
11403 +               raw_spin_lock_irq(&callback_lock);
11404                 cp->effective_mems = *new_mems;
11405 -               spin_unlock_irq(&callback_lock);
11406 +               raw_spin_unlock_irq(&callback_lock);
11407  
11408                 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
11409                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
11410 @@ -1246,9 +1246,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
11411         if (retval < 0)
11412                 goto done;
11413  
11414 -       spin_lock_irq(&callback_lock);
11415 +       raw_spin_lock_irq(&callback_lock);
11416         cs->mems_allowed = trialcs->mems_allowed;
11417 -       spin_unlock_irq(&callback_lock);
11418 +       raw_spin_unlock_irq(&callback_lock);
11419  
11420         /* use trialcs->mems_allowed as a temp variable */
11421         update_nodemasks_hier(cs, &trialcs->mems_allowed);
11422 @@ -1339,9 +1339,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
11423         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
11424                         || (is_spread_page(cs) != is_spread_page(trialcs)));
11425  
11426 -       spin_lock_irq(&callback_lock);
11427 +       raw_spin_lock_irq(&callback_lock);
11428         cs->flags = trialcs->flags;
11429 -       spin_unlock_irq(&callback_lock);
11430 +       raw_spin_unlock_irq(&callback_lock);
11431  
11432         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
11433                 rebuild_sched_domains_locked();
11434 @@ -1756,7 +1756,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11435         cpuset_filetype_t type = seq_cft(sf)->private;
11436         int ret = 0;
11437  
11438 -       spin_lock_irq(&callback_lock);
11439 +       raw_spin_lock_irq(&callback_lock);
11440  
11441         switch (type) {
11442         case FILE_CPULIST:
11443 @@ -1775,7 +1775,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
11444                 ret = -EINVAL;
11445         }
11446  
11447 -       spin_unlock_irq(&callback_lock);
11448 +       raw_spin_unlock_irq(&callback_lock);
11449         return ret;
11450  }
11451  
11452 @@ -1989,12 +1989,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
11453  
11454         cpuset_inc();
11455  
11456 -       spin_lock_irq(&callback_lock);
11457 +       raw_spin_lock_irq(&callback_lock);
11458         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11459                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
11460                 cs->effective_mems = parent->effective_mems;
11461         }
11462 -       spin_unlock_irq(&callback_lock);
11463 +       raw_spin_unlock_irq(&callback_lock);
11464  
11465         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
11466                 goto out_unlock;
11467 @@ -2021,12 +2021,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
11468         }
11469         rcu_read_unlock();
11470  
11471 -       spin_lock_irq(&callback_lock);
11472 +       raw_spin_lock_irq(&callback_lock);
11473         cs->mems_allowed = parent->mems_allowed;
11474         cs->effective_mems = parent->mems_allowed;
11475         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
11476         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
11477 -       spin_unlock_irq(&callback_lock);
11478 +       raw_spin_unlock_irq(&callback_lock);
11479  out_unlock:
11480         mutex_unlock(&cpuset_mutex);
11481         return 0;
11482 @@ -2065,7 +2065,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
11483  static void cpuset_bind(struct cgroup_subsys_state *root_css)
11484  {
11485         mutex_lock(&cpuset_mutex);
11486 -       spin_lock_irq(&callback_lock);
11487 +       raw_spin_lock_irq(&callback_lock);
11488  
11489         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
11490                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
11491 @@ -2076,7 +2076,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
11492                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
11493         }
11494  
11495 -       spin_unlock_irq(&callback_lock);
11496 +       raw_spin_unlock_irq(&callback_lock);
11497         mutex_unlock(&cpuset_mutex);
11498  }
11499  
11500 @@ -2177,12 +2177,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
11501  {
11502         bool is_empty;
11503  
11504 -       spin_lock_irq(&callback_lock);
11505 +       raw_spin_lock_irq(&callback_lock);
11506         cpumask_copy(cs->cpus_allowed, new_cpus);
11507         cpumask_copy(cs->effective_cpus, new_cpus);
11508         cs->mems_allowed = *new_mems;
11509         cs->effective_mems = *new_mems;
11510 -       spin_unlock_irq(&callback_lock);
11511 +       raw_spin_unlock_irq(&callback_lock);
11512  
11513         /*
11514          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
11515 @@ -2219,10 +2219,10 @@ hotplug_update_tasks(struct cpuset *cs,
11516         if (nodes_empty(*new_mems))
11517                 *new_mems = parent_cs(cs)->effective_mems;
11518  
11519 -       spin_lock_irq(&callback_lock);
11520 +       raw_spin_lock_irq(&callback_lock);
11521         cpumask_copy(cs->effective_cpus, new_cpus);
11522         cs->effective_mems = *new_mems;
11523 -       spin_unlock_irq(&callback_lock);
11524 +       raw_spin_unlock_irq(&callback_lock);
11525  
11526         if (cpus_updated)
11527                 update_tasks_cpumask(cs);
11528 @@ -2308,21 +2308,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
11529  
11530         /* synchronize cpus_allowed to cpu_active_mask */
11531         if (cpus_updated) {
11532 -               spin_lock_irq(&callback_lock);
11533 +               raw_spin_lock_irq(&callback_lock);
11534                 if (!on_dfl)
11535                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
11536                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
11537 -               spin_unlock_irq(&callback_lock);
11538 +               raw_spin_unlock_irq(&callback_lock);
11539                 /* we don't mess with cpumasks of tasks in top_cpuset */
11540         }
11541  
11542         /* synchronize mems_allowed to N_MEMORY */
11543         if (mems_updated) {
11544 -               spin_lock_irq(&callback_lock);
11545 +               raw_spin_lock_irq(&callback_lock);
11546                 if (!on_dfl)
11547                         top_cpuset.mems_allowed = new_mems;
11548                 top_cpuset.effective_mems = new_mems;
11549 -               spin_unlock_irq(&callback_lock);
11550 +               raw_spin_unlock_irq(&callback_lock);
11551                 update_tasks_nodemask(&top_cpuset);
11552         }
11553  
11554 @@ -2420,11 +2420,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
11555  {
11556         unsigned long flags;
11557  
11558 -       spin_lock_irqsave(&callback_lock, flags);
11559 +       raw_spin_lock_irqsave(&callback_lock, flags);
11560         rcu_read_lock();
11561         guarantee_online_cpus(task_cs(tsk), pmask);
11562         rcu_read_unlock();
11563 -       spin_unlock_irqrestore(&callback_lock, flags);
11564 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11565  }
11566  
11567  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
11568 @@ -2472,11 +2472,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
11569         nodemask_t mask;
11570         unsigned long flags;
11571  
11572 -       spin_lock_irqsave(&callback_lock, flags);
11573 +       raw_spin_lock_irqsave(&callback_lock, flags);
11574         rcu_read_lock();
11575         guarantee_online_mems(task_cs(tsk), &mask);
11576         rcu_read_unlock();
11577 -       spin_unlock_irqrestore(&callback_lock, flags);
11578 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11579  
11580         return mask;
11581  }
11582 @@ -2568,14 +2568,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
11583                 return true;
11584  
11585         /* Not hardwall and node outside mems_allowed: scan up cpusets */
11586 -       spin_lock_irqsave(&callback_lock, flags);
11587 +       raw_spin_lock_irqsave(&callback_lock, flags);
11588  
11589         rcu_read_lock();
11590         cs = nearest_hardwall_ancestor(task_cs(current));
11591         allowed = node_isset(node, cs->mems_allowed);
11592         rcu_read_unlock();
11593  
11594 -       spin_unlock_irqrestore(&callback_lock, flags);
11595 +       raw_spin_unlock_irqrestore(&callback_lock, flags);
11596         return allowed;
11597  }
11598  
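
The cpuset hunks convert callback_lock from spinlock_t to raw_spinlock_t. On PREEMPT_RT a spinlock_t is turned into a sleeping rtmutex, while a raw_spinlock_t keeps the classic disable-interrupts-and-spin behaviour, so it stays usable in paths that must not sleep. A minimal out-of-tree module sketch of the raw variant follows; the lock, the counter and the message are invented for illustration.

#include <linux/module.h>
#include <linux/spinlock.h>

/* A raw_spinlock_t busy-waits even on PREEMPT_RT, so the critical section
 * must stay short and must not sleep or allocate. */
static DEFINE_RAW_SPINLOCK(demo_lock);
static int demo_counter;

static int __init raw_demo_init(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_lock, flags);
        demo_counter++;
        raw_spin_unlock_irqrestore(&demo_lock, flags);

        pr_info("raw spinlock demo: counter=%d\n", demo_counter);
        return 0;
}

static void __exit raw_demo_exit(void)
{
}

module_init(raw_demo_init);
module_exit(raw_demo_exit);
MODULE_LICENSE("GPL");
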
11599 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
11600 index fc1ef736253c..83c666537a7a 100644
11601 --- a/kernel/debug/kdb/kdb_io.c
11602 +++ b/kernel/debug/kdb/kdb_io.c
11603 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11604         int linecount;
11605         int colcount;
11606         int logging, saved_loglevel = 0;
11607 -       int saved_trap_printk;
11608         int got_printf_lock = 0;
11609         int retlen = 0;
11610         int fnd, len;
11611 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11612         unsigned long uninitialized_var(flags);
11613  
11614         preempt_disable();
11615 -       saved_trap_printk = kdb_trap_printk;
11616 -       kdb_trap_printk = 0;
11617  
11618         /* Serialize kdb_printf if multiple cpus try to write at once.
11619          * But if any cpu goes recursive in kdb, just print the output,
11620 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
11621         } else {
11622                 __release(kdb_printf_lock);
11623         }
11624 -       kdb_trap_printk = saved_trap_printk;
11625         preempt_enable();
11626         return retlen;
11627  }
11628 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
11629         va_list ap;
11630         int r;
11631  
11632 +       kdb_trap_printk++;
11633         va_start(ap, fmt);
11634         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
11635         va_end(ap);
11636 +       kdb_trap_printk--;
11637  
11638         return r;
11639  }
11640 diff --git a/kernel/events/core.c b/kernel/events/core.c
11641 index 02c8421f8c01..3748cb7b2d6e 100644
11642 --- a/kernel/events/core.c
11643 +++ b/kernel/events/core.c
11644 @@ -1050,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
11645         raw_spin_lock_init(&cpuctx->hrtimer_lock);
11646         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
11647         timer->function = perf_mux_hrtimer_handler;
11648 +       timer->irqsafe = 1;
11649  }
11650  
11651  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
11652 @@ -8335,6 +8336,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
11653  
11654         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
11655         hwc->hrtimer.function = perf_swevent_hrtimer;
11656 +       hwc->hrtimer.irqsafe = 1;
11657  
11658         /*
11659          * Since hrtimers have a fixed rate, we can do a static freq->period
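
Both perf hunks set the hrtimer field "irqsafe", which exists only in this patch series and marks the timer as safe to expire directly in hard-irq context instead of being deferred to the RT softirq-based timer handling. For comparison, a stock hrtimer without that field is set up as in the self-contained module sketch below; all names are made up.

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
{
        /* Expiry normally runs in hard-irq context; on an RT kernel a timer
         * without the irqsafe marking may be deferred instead. */
        pr_info("hrtimer demo: expired\n");
        return HRTIMER_NORESTART;
}

static int __init hrtimer_demo_init(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        demo_timer.function = demo_timer_fn;
        hrtimer_start(&demo_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
        return 0;
}

static void __exit hrtimer_demo_exit(void)
{
        hrtimer_cancel(&demo_timer);
}

module_init(hrtimer_demo_init);
module_exit(hrtimer_demo_exit);
MODULE_LICENSE("GPL");
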
11660 diff --git a/kernel/exit.c b/kernel/exit.c
11661 index 3076f3089919..fb2ebcf3ca7c 100644
11662 --- a/kernel/exit.c
11663 +++ b/kernel/exit.c
11664 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
11665          * Do this under ->siglock, we can race with another thread
11666          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
11667          */
11668 -       flush_sigqueue(&tsk->pending);
11669 +       flush_task_sigqueue(tsk);
11670         tsk->sighand = NULL;
11671         spin_unlock(&sighand->siglock);
11672  
11673 diff --git a/kernel/fork.c b/kernel/fork.c
11674 index ba8a01564985..47784f8aed37 100644
11675 --- a/kernel/fork.c
11676 +++ b/kernel/fork.c
11677 @@ -76,6 +76,7 @@
11678  #include <linux/compiler.h>
11679  #include <linux/sysctl.h>
11680  #include <linux/kcov.h>
11681 +#include <linux/kprobes.h>
11682  
11683  #include <asm/pgtable.h>
11684  #include <asm/pgalloc.h>
11685 @@ -376,13 +377,24 @@ static inline void put_signal_struct(struct signal_struct *sig)
11686         if (atomic_dec_and_test(&sig->sigcnt))
11687                 free_signal_struct(sig);
11688  }
11689 -
11690 +#ifdef CONFIG_PREEMPT_RT_BASE
11691 +static
11692 +#endif
11693  void __put_task_struct(struct task_struct *tsk)
11694  {
11695         WARN_ON(!tsk->exit_state);
11696         WARN_ON(atomic_read(&tsk->usage));
11697         WARN_ON(tsk == current);
11698  
11699 +       /*
11700 +        * Remove function-return probe instances associated with this
11701 +        * task and put them back on the free list.
11702 +        */
11703 +       kprobe_flush_task(tsk);
11704 +
11705 +       /* Task is done with its stack. */
11706 +       put_task_stack(tsk);
11707 +
11708         cgroup_free(tsk);
11709         task_numa_free(tsk);
11710         security_task_free(tsk);
11711 @@ -393,7 +405,18 @@ void __put_task_struct(struct task_struct *tsk)
11712         if (!profile_handoff_task(tsk))
11713                 free_task(tsk);
11714  }
11715 +#ifndef CONFIG_PREEMPT_RT_BASE
11716  EXPORT_SYMBOL_GPL(__put_task_struct);
11717 +#else
11718 +void __put_task_struct_cb(struct rcu_head *rhp)
11719 +{
11720 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
11721 +
11722 +       __put_task_struct(tsk);
11723 +
11724 +}
11725 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
11726 +#endif
11727  
11728  void __init __weak arch_task_cache_init(void) { }
11729  
11730 @@ -852,6 +875,19 @@ void __mmdrop(struct mm_struct *mm)
11731  }
11732  EXPORT_SYMBOL_GPL(__mmdrop);
11733  
11734 +#ifdef CONFIG_PREEMPT_RT_BASE
11735 +/*
11736 + * RCU callback for delayed mm drop. Not strictly an RCU matter, but we
11737 + * don't want to add another facility just to make this work.
11738 + */
11739 +void __mmdrop_delayed(struct rcu_head *rhp)
11740 +{
11741 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
11742 +
11743 +       __mmdrop(mm);
11744 +}
11745 +#endif
11746 +
11747  static inline void __mmput(struct mm_struct *mm)
11748  {
11749         VM_BUG_ON(atomic_read(&mm->mm_users));
11750 @@ -1426,6 +1462,9 @@ static void rt_mutex_init_task(struct task_struct *p)
11751   */
11752  static void posix_cpu_timers_init(struct task_struct *tsk)
11753  {
11754 +#ifdef CONFIG_PREEMPT_RT_BASE
11755 +       tsk->posix_timer_list = NULL;
11756 +#endif
11757         tsk->cputime_expires.prof_exp = 0;
11758         tsk->cputime_expires.virt_exp = 0;
11759         tsk->cputime_expires.sched_exp = 0;
11760 @@ -1552,6 +1591,7 @@ static __latent_entropy struct task_struct *copy_process(
11761         spin_lock_init(&p->alloc_lock);
11762  
11763         init_sigpending(&p->pending);
11764 +       p->sigqueue_cache = NULL;
11765  
11766         p->utime = p->stime = p->gtime = 0;
11767         p->utimescaled = p->stimescaled = 0;
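
On PREEMPT_RT the fork.c hunks route the final __put_task_struct() and __mmdrop() through RCU callbacks (put_rcu, delayed_drop) so the expensive teardown runs later, from a context where it is safe. The underlying pattern is the ordinary rcu_head plus container_of() deferred free; a self-contained module sketch with an invented demo object:

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct demo_obj {
        int value;
        struct rcu_head rcu;    /* cf. task_struct::put_rcu, mm_struct::delayed_drop */
};

static void demo_free_rcu(struct rcu_head *rhp)
{
        /* Runs after a grace period, in a context where freeing is safe. */
        struct demo_obj *obj = container_of(rhp, struct demo_obj, rcu);

        pr_info("rcu demo: freeing value=%d\n", obj->value);
        kfree(obj);
}

static int __init rcu_demo_init(void)
{
        struct demo_obj *obj = kmalloc(sizeof(*obj), GFP_KERNEL);

        if (!obj)
                return -ENOMEM;
        obj->value = 42;
        call_rcu(&obj->rcu, demo_free_rcu);     /* cf. __put_task_struct_cb() */
        return 0;
}

static void __exit rcu_demo_exit(void)
{
        rcu_barrier();          /* make sure the callback ran before unload */
}

module_init(rcu_demo_init);
module_exit(rcu_demo_exit);
MODULE_LICENSE("GPL");
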
11768 diff --git a/kernel/futex.c b/kernel/futex.c
11769 index 2c4be467fecd..064917c2d9a5 100644
11770 --- a/kernel/futex.c
11771 +++ b/kernel/futex.c
11772 @@ -904,7 +904,9 @@ void exit_pi_state_list(struct task_struct *curr)
11773                  * task still owns the PI-state:
11774                  */
11775                 if (head->next != next) {
11776 +                       raw_spin_unlock_irq(&curr->pi_lock);
11777                         spin_unlock(&hb->lock);
11778 +                       raw_spin_lock_irq(&curr->pi_lock);
11779                         continue;
11780                 }
11781  
11782 @@ -1299,6 +1301,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11783         struct futex_pi_state *pi_state = this->pi_state;
11784         u32 uninitialized_var(curval), newval;
11785         WAKE_Q(wake_q);
11786 +       WAKE_Q(wake_sleeper_q);
11787         bool deboost;
11788         int ret = 0;
11789  
11790 @@ -1365,7 +1368,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11791  
11792         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
11793  
11794 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
11795 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
11796 +                                       &wake_sleeper_q);
11797  
11798         /*
11799          * First unlock HB so the waiter does not spin on it once he got woken
11800 @@ -1373,8 +1377,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
11801          * deboost first (and lose our higher priority), then the task might get
11802          * scheduled away before the wake up can take place.
11803          */
11804 -       spin_unlock(&hb->lock);
11805 +       deboost |= spin_unlock_no_deboost(&hb->lock);
11806         wake_up_q(&wake_q);
11807 +       wake_up_q_sleeper(&wake_sleeper_q);
11808         if (deboost)
11809                 rt_mutex_adjust_prio(current);
11810  
11811 @@ -1924,6 +1929,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
11812                                 requeue_pi_wake_futex(this, &key2, hb2);
11813                                 drop_count++;
11814                                 continue;
11815 +                       } else if (ret == -EAGAIN) {
11816 +                               /*
11817 +                                * Waiter was woken by timeout or
11818 +                                * signal and has set pi_blocked_on to
11819 +                                * PI_WAKEUP_INPROGRESS before we
11820 +                                * tried to enqueue it on the rtmutex.
11821 +                                */
11822 +                               this->pi_state = NULL;
11823 +                               put_pi_state(pi_state);
11824 +                               continue;
11825                         } else if (ret) {
11826                                 /*
11827                                  * rt_mutex_start_proxy_lock() detected a
11828 @@ -2814,7 +2829,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11829         struct hrtimer_sleeper timeout, *to = NULL;
11830         struct rt_mutex_waiter rt_waiter;
11831         struct rt_mutex *pi_mutex = NULL;
11832 -       struct futex_hash_bucket *hb;
11833 +       struct futex_hash_bucket *hb, *hb2;
11834         union futex_key key2 = FUTEX_KEY_INIT;
11835         struct futex_q q = futex_q_init;
11836         int res, ret;
11837 @@ -2839,10 +2854,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11838          * The waiter is allocated on our stack, manipulated by the requeue
11839          * code while we sleep on uaddr.
11840          */
11841 -       debug_rt_mutex_init_waiter(&rt_waiter);
11842 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
11843 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
11844 -       rt_waiter.task = NULL;
11845 +       rt_mutex_init_waiter(&rt_waiter, false);
11846  
11847         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
11848         if (unlikely(ret != 0))
11849 @@ -2873,20 +2885,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11850         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
11851         futex_wait_queue_me(hb, &q, to);
11852  
11853 -       spin_lock(&hb->lock);
11854 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11855 -       spin_unlock(&hb->lock);
11856 -       if (ret)
11857 -               goto out_put_keys;
11858 +       /*
11859 +        * On RT we must avoid races with requeue and trying to block
11860 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
11861 +        * serializing access to pi_blocked_on with pi_lock.
11862 +        */
11863 +       raw_spin_lock_irq(&current->pi_lock);
11864 +       if (current->pi_blocked_on) {
11865 +               /*
11866 +                * We have been requeued or are in the process of
11867 +                * being requeued.
11868 +                */
11869 +               raw_spin_unlock_irq(&current->pi_lock);
11870 +       } else {
11871 +               /*
11872 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
11873 +                * prevents a concurrent requeue from moving us to the
11874 +                * uaddr2 rtmutex. After that we can safely acquire
11875 +                * (and possibly block on) hb->lock.
11876 +                */
11877 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
11878 +               raw_spin_unlock_irq(&current->pi_lock);
11879 +
11880 +               spin_lock(&hb->lock);
11881 +
11882 +               /*
11883 +                * Clean up pi_blocked_on. We might leak it otherwise
11884 +                * when we succeeded with the hb->lock in the fast
11885 +                * path.
11886 +                */
11887 +               raw_spin_lock_irq(&current->pi_lock);
11888 +               current->pi_blocked_on = NULL;
11889 +               raw_spin_unlock_irq(&current->pi_lock);
11890 +
11891 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
11892 +               spin_unlock(&hb->lock);
11893 +               if (ret)
11894 +                       goto out_put_keys;
11895 +       }
11896  
11897         /*
11898 -        * In order for us to be here, we know our q.key == key2, and since
11899 -        * we took the hb->lock above, we also know that futex_requeue() has
11900 -        * completed and we no longer have to concern ourselves with a wakeup
11901 -        * race with the atomic proxy lock acquisition by the requeue code. The
11902 -        * futex_requeue dropped our key1 reference and incremented our key2
11903 -        * reference count.
11904 +        * In order to be here, we have either been requeued, are in
11905 +        * the process of being requeued, or requeue successfully
11906 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
11907 +        * non-null above, we may be racing with a requeue.  Do not
11908 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
11909 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
11910 +        * reference and incremented our key2 reference count.
11911          */
11912 +       hb2 = hash_futex(&key2);
11913  
11914         /* Check if the requeue code acquired the second futex for us. */
11915         if (!q.rt_waiter) {
11916 @@ -2895,14 +2942,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11917                  * did a lock-steal - fix up the PI-state in that case.
11918                  */
11919                 if (q.pi_state && (q.pi_state->owner != current)) {
11920 -                       spin_lock(q.lock_ptr);
11921 +                       spin_lock(&hb2->lock);
11922 +                       BUG_ON(&hb2->lock != q.lock_ptr);
11923                         ret = fixup_pi_state_owner(uaddr2, &q, current);
11924                         /*
11925                          * Drop the reference to the pi state which
11926                          * the requeue_pi() code acquired for us.
11927                          */
11928                         put_pi_state(q.pi_state);
11929 -                       spin_unlock(q.lock_ptr);
11930 +                       spin_unlock(&hb2->lock);
11931                 }
11932         } else {
11933                 /*
11934 @@ -2915,7 +2963,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
11935                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
11936                 debug_rt_mutex_free_waiter(&rt_waiter);
11937  
11938 -               spin_lock(q.lock_ptr);
11939 +               spin_lock(&hb2->lock);
11940 +               BUG_ON(&hb2->lock != q.lock_ptr);
11941                 /*
11942                  * Fixup the pi_state owner and possibly acquire the lock if we
11943                  * haven't already.
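
All of the futex.c changes above are in the priority-inheritance paths (wake_futex_pi() and requeue-PI). From user space those paths are reached through glibc's PTHREAD_PRIO_INHERIT mutexes, which are backed by the PI futex operations once there is contention; a minimal runnable sketch:

/* build: gcc -pthread pi_mutex.c -o pi_mutex */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
        pthread_mutexattr_t attr;
        pthread_mutex_t m;
        int err;

        pthread_mutexattr_init(&attr);
        /* PTHREAD_PRIO_INHERIT makes glibc use the PI futex operations
         * (FUTEX_LOCK_PI / FUTEX_UNLOCK_PI) handled in kernel/futex.c. */
        err = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        if (err) {
                fprintf(stderr, "setprotocol: %d\n", err);
                return 1;
        }
        pthread_mutex_init(&m, &attr);

        pthread_mutex_lock(&m);
        puts("holding a priority-inheritance mutex");
        pthread_mutex_unlock(&m);

        pthread_mutex_destroy(&m);
        pthread_mutexattr_destroy(&attr);
        return 0;
}

The uncontended lock and unlock above stay in user space; the kernel PI code only runs when a waiter is, or may be, queued.
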
11944 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
11945 index d3f24905852c..f87aa8fdcc51 100644
11946 --- a/kernel/irq/handle.c
11947 +++ b/kernel/irq/handle.c
11948 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
11949  {
11950         irqreturn_t retval;
11951         unsigned int flags = 0;
11952 +       struct pt_regs *regs = get_irq_regs();
11953 +       u64 ip = regs ? instruction_pointer(regs) : 0;
11954  
11955         retval = __handle_irq_event_percpu(desc, &flags);
11956  
11957 -       add_interrupt_randomness(desc->irq_data.irq, flags);
11958 +#ifdef CONFIG_PREEMPT_RT_FULL
11959 +       desc->random_ip = ip;
11960 +#else
11961 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
11962 +#endif
11963  
11964         if (!noirqdebug)
11965                 note_interrupt(desc, retval);
11966 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
11967 index 6b669593e7eb..e357bf6c59d5 100644
11968 --- a/kernel/irq/manage.c
11969 +++ b/kernel/irq/manage.c
11970 @@ -22,6 +22,7 @@
11971  #include "internals.h"
11972  
11973  #ifdef CONFIG_IRQ_FORCED_THREADING
11974 +# ifndef CONFIG_PREEMPT_RT_BASE
11975  __read_mostly bool force_irqthreads;
11976  
11977  static int __init setup_forced_irqthreads(char *arg)
11978 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
11979         return 0;
11980  }
11981  early_param("threadirqs", setup_forced_irqthreads);
11982 +# endif
11983  #endif
11984  
11985  static void __synchronize_hardirq(struct irq_desc *desc)
11986 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
11987  
11988         if (desc->affinity_notify) {
11989                 kref_get(&desc->affinity_notify->kref);
11990 +
11991 +#ifdef CONFIG_PREEMPT_RT_BASE
11992 +               swork_queue(&desc->affinity_notify->swork);
11993 +#else
11994                 schedule_work(&desc->affinity_notify->work);
11995 +#endif
11996         }
11997         irqd_set(data, IRQD_AFFINITY_SET);
11998  
11999 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
12000  }
12001  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
12002  
12003 -static void irq_affinity_notify(struct work_struct *work)
12004 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
12005  {
12006 -       struct irq_affinity_notify *notify =
12007 -               container_of(work, struct irq_affinity_notify, work);
12008         struct irq_desc *desc = irq_to_desc(notify->irq);
12009         cpumask_var_t cpumask;
12010         unsigned long flags;
12011 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
12012         kref_put(&notify->kref, notify->release);
12013  }
12014  
12015 +#ifdef CONFIG_PREEMPT_RT_BASE
12016 +static void init_helper_thread(void)
12017 +{
12018 +       static int init_sworker_once;
12019 +
12020 +       if (init_sworker_once)
12021 +               return;
12022 +       if (WARN_ON(swork_get()))
12023 +               return;
12024 +       init_sworker_once = 1;
12025 +}
12026 +
12027 +static void irq_affinity_notify(struct swork_event *swork)
12028 +{
12029 +       struct irq_affinity_notify *notify =
12030 +               container_of(swork, struct irq_affinity_notify, swork);
12031 +       _irq_affinity_notify(notify);
12032 +}
12033 +
12034 +#else
12035 +
12036 +static void irq_affinity_notify(struct work_struct *work)
12037 +{
12038 +       struct irq_affinity_notify *notify =
12039 +               container_of(work, struct irq_affinity_notify, work);
12040 +       _irq_affinity_notify(notify);
12041 +}
12042 +#endif
12043 +
12044  /**
12045   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
12046   *     @irq:           Interrupt for which to enable/disable notification
12047 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
12048         if (notify) {
12049                 notify->irq = irq;
12050                 kref_init(&notify->kref);
12051 +#ifdef CONFIG_PREEMPT_RT_BASE
12052 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
12053 +               init_helper_thread();
12054 +#else
12055                 INIT_WORK(&notify->work, irq_affinity_notify);
12056 +#endif
12057         }
12058  
12059         raw_spin_lock_irqsave(&desc->lock, flags);
12060 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
12061         local_bh_disable();
12062         ret = action->thread_fn(action->irq, action->dev_id);
12063         irq_finalize_oneshot(desc, action);
12064 -       local_bh_enable();
12065 +       /*
12066 +        * Interrupts which have real time requirements can be set up
12067 +        * to avoid softirq processing in the thread handler. This is
12068 +        * safe as these interrupts do not raise soft interrupts.
12069 +        */
12070 +       if (irq_settings_no_softirq_call(desc))
12071 +               _local_bh_enable();
12072 +       else
12073 +               local_bh_enable();
12074         return ret;
12075  }
12076  
12077 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
12078                 if (action_ret == IRQ_WAKE_THREAD)
12079                         irq_wake_secondary(desc, action);
12080  
12081 +#ifdef CONFIG_PREEMPT_RT_FULL
12082 +               migrate_disable();
12083 +               add_interrupt_randomness(action->irq, 0,
12084 +                                desc->random_ip ^ (unsigned long) action);
12085 +               migrate_enable();
12086 +#endif
12087                 wake_threads_waitq(desc);
12088         }
12089  
12090 @@ -1336,6 +1389,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
12091                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
12092                 }
12093  
12094 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
12095 +                       irq_settings_set_no_softirq_call(desc);
12096 +
12097                 /* Set default affinity mask once everything is setup */
12098                 setup_affinity(desc, mask);
12099  
12100 @@ -2061,7 +2117,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
12101   *     This call sets the internal irqchip state of an interrupt,
12102   *     depending on the value of @which.
12103   *
12104 - *     This function should be called with preemption disabled if the
12105 + *     This function should be called with migration disabled if the
12106   *     interrupt controller has per-cpu registers.
12107   */
12108  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
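
With CONFIG_PREEMPT_RT_BASE the hunk above compiles out the force_irqthreads boot handling because interrupt threading is always on, so most handlers run in kernel threads named irq/<nr>-<name>. A small user-space program that lists those threads by reading /proc/<pid>/comm:

/* build: gcc list_irq_threads.c -o list_irq_threads */
#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        DIR *proc = opendir("/proc");
        struct dirent *de;
        char path[64], comm[64];

        if (!proc) {
                perror("/proc");
                return 1;
        }
        while ((de = readdir(proc)) != NULL) {
                FILE *f;

                if (!isdigit((unsigned char)de->d_name[0]))
                        continue;
                snprintf(path, sizeof(path), "/proc/%s/comm", de->d_name);
                f = fopen(path, "r");
                if (!f)
                        continue;
                if (fgets(comm, sizeof(comm), f) && !strncmp(comm, "irq/", 4))
                        printf("%s\t%s", de->d_name, comm);  /* comm keeps its newline */
                fclose(f);
        }
        closedir(proc);
        return 0;
}

On a non-RT kernel the same threads appear when booting with threadirqs.
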
12109 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
12110 index 320579d89091..2df2d4445b1e 100644
12111 --- a/kernel/irq/settings.h
12112 +++ b/kernel/irq/settings.h
12113 @@ -16,6 +16,7 @@ enum {
12114         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
12115         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
12116         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
12117 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
12118         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
12119  };
12120  
12121 @@ -30,6 +31,7 @@ enum {
12122  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
12123  #define IRQ_IS_POLLED          GOT_YOU_MORON
12124  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
12125 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
12126  #undef IRQF_MODIFY_MASK
12127  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
12128  
12129 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
12130         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
12131  }
12132  
12133 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
12134 +{
12135 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
12136 +}
12137 +
12138 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
12139 +{
12140 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
12141 +}
12142 +
12143  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
12144  {
12145         return desc->status_use_accessors & _IRQ_PER_CPU;
12146 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
12147 index 5707f97a3e6a..73f38dc7a7fb 100644
12148 --- a/kernel/irq/spurious.c
12149 +++ b/kernel/irq/spurious.c
12150 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
12151  
12152  static int __init irqfixup_setup(char *str)
12153  {
12154 +#ifdef CONFIG_PREEMPT_RT_BASE
12155 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
12156 +       return 1;
12157 +#endif
12158         irqfixup = 1;
12159         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
12160         printk(KERN_WARNING "This may impact system performance.\n");
12161 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
12162  
12163  static int __init irqpoll_setup(char *str)
12164  {
12165 +#ifdef CONFIG_PREEMPT_RT_BASE
12166 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
12167 +       return 1;
12168 +#endif
12169         irqfixup = 2;
12170         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
12171                                 "enabled\n");
12172 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
12173 index bcf107ce0854..2899ba0d23d1 100644
12174 --- a/kernel/irq_work.c
12175 +++ b/kernel/irq_work.c
12176 @@ -17,6 +17,7 @@
12177  #include <linux/cpu.h>
12178  #include <linux/notifier.h>
12179  #include <linux/smp.h>
12180 +#include <linux/interrupt.h>
12181  #include <asm/processor.h>
12182  
12183  
12184 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
12185   */
12186  bool irq_work_queue_on(struct irq_work *work, int cpu)
12187  {
12188 +       struct llist_head *list;
12189 +
12190         /* All work should have been flushed before going offline */
12191         WARN_ON_ONCE(cpu_is_offline(cpu));
12192  
12193 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
12194         if (!irq_work_claim(work))
12195                 return false;
12196  
12197 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
12198 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
12199 +               list = &per_cpu(lazy_list, cpu);
12200 +       else
12201 +               list = &per_cpu(raised_list, cpu);
12202 +
12203 +       if (llist_add(&work->llnode, list))
12204                 arch_send_call_function_single_ipi(cpu);
12205  
12206         return true;
12207 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
12208  /* Enqueue the irq work @work on the current CPU */
12209  bool irq_work_queue(struct irq_work *work)
12210  {
12211 +       struct llist_head *list;
12212 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
12213 +
12214         /* Only queue if not already pending */
12215         if (!irq_work_claim(work))
12216                 return false;
12217 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
12218         /* Queue the entry and raise the IPI if needed. */
12219         preempt_disable();
12220  
12221 -       /* If the work is "lazy", handle it from next tick if any */
12222 -       if (work->flags & IRQ_WORK_LAZY) {
12223 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
12224 -                   tick_nohz_tick_stopped())
12225 -                       arch_irq_work_raise();
12226 -       } else {
12227 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
12228 +       lazy_work = work->flags & IRQ_WORK_LAZY;
12229 +
12230 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
12231 +               list = this_cpu_ptr(&lazy_list);
12232 +       else
12233 +               list = this_cpu_ptr(&raised_list);
12234 +
12235 +       if (llist_add(&work->llnode, list)) {
12236 +               if (!lazy_work || tick_nohz_tick_stopped())
12237                         arch_irq_work_raise();
12238         }
12239  
12240 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
12241         raised = this_cpu_ptr(&raised_list);
12242         lazy = this_cpu_ptr(&lazy_list);
12243  
12244 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
12245 -               if (llist_empty(lazy))
12246 -                       return false;
12247 +       if (llist_empty(raised) && llist_empty(lazy))
12248 +               return false;
12249  
12250         /* All work should have been flushed before going offline */
12251         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
12252 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
12253         struct irq_work *work;
12254         struct llist_node *llnode;
12255  
12256 -       BUG_ON(!irqs_disabled());
12257 +       BUG_ON_NONRT(!irqs_disabled());
12258  
12259         if (llist_empty(list))
12260                 return;
12261 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
12262  void irq_work_run(void)
12263  {
12264         irq_work_run_list(this_cpu_ptr(&raised_list));
12265 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
12266 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
12267 +               /*
12268 +                * NOTE: we raise softirq via IPI for safety,
12269 +                * and execute in irq_work_tick() to move the
12270 +                * overhead from hard to soft irq context.
12271 +                */
12272 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
12273 +                       raise_softirq(TIMER_SOFTIRQ);
12274 +       } else
12275 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
12276  }
12277  EXPORT_SYMBOL_GPL(irq_work_run);
12278  
12279 @@ -179,8 +200,17 @@ void irq_work_tick(void)
12280  
12281         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
12282                 irq_work_run_list(raised);
12283 +
12284 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
12285 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
12286 +}
12287 +
12288 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
12289 +void irq_work_tick_soft(void)
12290 +{
12291         irq_work_run_list(this_cpu_ptr(&lazy_list));
12292  }
12293 +#endif
12294  
12295  /*
12296   * Synchronize against the irq_work @entry, ensures the entry is not
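
The irq_work changes keep two per-CPU lists: raised_list is still run from the hard-irq path, while on PREEMPT_RT everything not marked IRQ_WORK_HARD_IRQ goes to lazy_list and is run from the timer softirq via irq_work_tick_soft(). Queueing work uses the same API either way; a minimal module sketch with made-up names:

#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/completion.h>

static DECLARE_COMPLETION(ran);
static struct irq_work demo_work;

static void demo_fn(struct irq_work *work)
{
        /* Runs from hard-irq context, or, for lazy work on RT, from the
         * timer softirq as per the change above. Must not sleep. */
        complete(&ran);
}

static int __init irq_work_demo_init(void)
{
        init_irq_work(&demo_work, demo_fn);
        irq_work_queue(&demo_work);
        wait_for_completion(&ran);
        pr_info("irq_work demo: callback ran\n");
        return 0;
}

static void __exit irq_work_demo_exit(void)
{
}

module_init(irq_work_demo_init);
module_exit(irq_work_demo_exit);
MODULE_LICENSE("GPL");
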
12297 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
12298 index ee1bc1bb8feb..ddef07958840 100644
12299 --- a/kernel/ksysfs.c
12300 +++ b/kernel/ksysfs.c
12301 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
12302  
12303  #endif /* CONFIG_KEXEC_CORE */
12304  
12305 +#if defined(CONFIG_PREEMPT_RT_FULL)
12306 +static ssize_t  realtime_show(struct kobject *kobj,
12307 +                             struct kobj_attribute *attr, char *buf)
12308 +{
12309 +       return sprintf(buf, "%d\n", 1);
12310 +}
12311 +KERNEL_ATTR_RO(realtime);
12312 +#endif
12313 +
12314  /* whether file capabilities are enabled */
12315  static ssize_t fscaps_show(struct kobject *kobj,
12316                                   struct kobj_attribute *attr, char *buf)
12317 @@ -225,6 +234,9 @@ static struct attribute * kernel_attrs[] = {
12318         &rcu_expedited_attr.attr,
12319         &rcu_normal_attr.attr,
12320  #endif
12321 +#ifdef CONFIG_PREEMPT_RT_FULL
12322 +       &realtime_attr.attr,
12323 +#endif
12324         NULL
12325  };
12326  
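
The ksysfs hunk adds /sys/kernel/realtime, which reads as "1" on a kernel built with CONFIG_PREEMPT_RT_FULL and does not exist otherwise. A trivial user-space check:

/* build: gcc rt_check.c -o rt_check */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        /* The attribute only exists when the kernel was built with
         * CONFIG_PREEMPT_RT_FULL; absence simply means "not an RT kernel". */
        if (f) {
                if (fscanf(f, "%d", &rt) != 1)
                        rt = 0;
                fclose(f);
        }
        printf("PREEMPT_RT_FULL kernel: %s\n", rt ? "yes" : "no");
        return 0;
}
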
12327 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
12328 index 6f88e352cd4f..5e27fb1079e7 100644
12329 --- a/kernel/locking/Makefile
12330 +++ b/kernel/locking/Makefile
12331 @@ -2,7 +2,7 @@
12332  # and is generally not a function of system call inputs.
12333  KCOV_INSTRUMENT                := n
12334  
12335 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
12336 +obj-y += semaphore.o percpu-rwsem.o
12337  
12338  ifdef CONFIG_FUNCTION_TRACER
12339  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
12340 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
12341  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
12342  endif
12343  
12344 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
12345 +obj-y += mutex.o
12346  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
12347 +obj-y += rwsem.o
12348 +endif
12349  obj-$(CONFIG_LOCKDEP) += lockdep.o
12350  ifeq ($(CONFIG_PROC_FS),y)
12351  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
12352 @@ -24,7 +28,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
12353  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
12354  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
12355  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
12356 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
12357  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
12358  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
12359 +endif
12360 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
12361  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
12362  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
12363 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
12364 index 4d7ffc0a0d00..9e52009c192e 100644
12365 --- a/kernel/locking/lockdep.c
12366 +++ b/kernel/locking/lockdep.c
12367 @@ -3689,6 +3689,7 @@ static void check_flags(unsigned long flags)
12368                 }
12369         }
12370  
12371 +#ifndef CONFIG_PREEMPT_RT_FULL
12372         /*
12373          * We dont accurately track softirq state in e.g.
12374          * hardirq contexts (such as on 4KSTACKS), so only
12375 @@ -3703,6 +3704,7 @@ static void check_flags(unsigned long flags)
12376                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
12377                 }
12378         }
12379 +#endif
12380  
12381         if (!debug_locks)
12382                 print_irqtrace_events(current);
12383 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
12384 index f8c5af52a131..788068773e61 100644
12385 --- a/kernel/locking/locktorture.c
12386 +++ b/kernel/locking/locktorture.c
12387 @@ -26,7 +26,6 @@
12388  #include <linux/kthread.h>
12389  #include <linux/sched/rt.h>
12390  #include <linux/spinlock.h>
12391 -#include <linux/rwlock.h>
12392  #include <linux/mutex.h>
12393  #include <linux/rwsem.h>
12394  #include <linux/smp.h>
12395 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
12396 index ce182599cf2e..2ad3a1e8344c 100644
12397 --- a/kernel/locking/percpu-rwsem.c
12398 +++ b/kernel/locking/percpu-rwsem.c
12399 @@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
12400         /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
12401         rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
12402         __init_rwsem(&sem->rw_sem, name, rwsem_key);
12403 -       init_waitqueue_head(&sem->writer);
12404 +       init_swait_queue_head(&sem->writer);
12405         sem->readers_block = 0;
12406         return 0;
12407  }
12408 @@ -103,7 +103,7 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
12409         __this_cpu_dec(*sem->read_count);
12410  
12411         /* Prod writer to recheck readers_active */
12412 -       wake_up(&sem->writer);
12413 +       swake_up(&sem->writer);
12414  }
12415  EXPORT_SYMBOL_GPL(__percpu_up_read);
12416  
12417 @@ -160,7 +160,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
12418          */
12419  
12420         /* Wait for all now active readers to complete. */
12421 -       wait_event(sem->writer, readers_active_check(sem));
12422 +       swait_event(sem->writer, readers_active_check(sem));
12423  }
12424  EXPORT_SYMBOL_GPL(percpu_down_write);
12425  
12426 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
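
percpu-rwsem's writer wait is switched from a regular waitqueue to a simple waitqueue (swait), whose head is protected by a raw lock and whose wakeups stay cheap, so it remains usable on RT where regular waitqueue locks become sleeping locks. The sketch below shows the same swait API in a self-contained module; the names are invented, and later mainline kernels renamed swake_up() to swake_up_one() and swait_event() to swait_event_exclusive().

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/swait.h>
#include <linux/delay.h>
#include <linux/completion.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
static DECLARE_COMPLETION(demo_done);
static bool demo_cond;

static int demo_waiter(void *unused)
{
        /* cf. percpu_down_write(): sleep until the condition holds. */
        swait_event(demo_wq, READ_ONCE(demo_cond));
        pr_info("swait demo: condition became true\n");
        complete(&demo_done);
        return 0;
}

static int __init swait_demo_init(void)
{
        struct task_struct *tsk = kthread_run(demo_waiter, NULL, "swait-demo");

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);

        msleep(100);
        WRITE_ONCE(demo_cond, true);
        swake_up(&demo_wq);             /* cf. __percpu_up_read() */
        return 0;
}

static void __exit swait_demo_exit(void)
{
        /* Don't unload while the waiter might still be running. */
        wait_for_completion(&demo_done);
}

module_init(swait_demo_init);
module_exit(swait_demo_exit);
MODULE_LICENSE("GPL");
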
12427 new file mode 100644
12428 index 000000000000..665754c00e1e
12429 --- /dev/null
12430 +++ b/kernel/locking/rt.c
12431 @@ -0,0 +1,498 @@
12432 +/*
12433 + * kernel/rt.c
12434 + *
12435 + * Real-Time Preemption Support
12436 + *
12437 + * started by Ingo Molnar:
12438 + *
12439 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
12440 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12441 + *
12442 + * historic credit for proving that Linux spinlocks can be implemented via
12443 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
12444 + * and others) who prototyped it on 2.4 and did lots of comparative
12445 + * research and analysis; TimeSys, for proving that you can implement a
12446 + * fully preemptible kernel via the use of IRQ threading and mutexes;
12447 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
12448 + * right one; and to MontaVista, who ported pmutexes to 2.6.
12449 + *
12450 + * This code is a from-scratch implementation and is not based on pmutexes,
12451 + * but the idea of converting spinlocks to mutexes is used here too.
12452 + *
12453 + * lock debugging, locking tree, deadlock detection:
12454 + *
12455 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
12456 + *  Released under the General Public License (GPL).
12457 + *
12458 + * Includes portions of the generic R/W semaphore implementation from:
12459 + *
12460 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
12461 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
12462 + *  - Derived also from comments by Linus
12463 + *
12464 + * Pending ownership of locks and ownership stealing:
12465 + *
12466 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
12467 + *
12468 + *   (also by Steven Rostedt)
12469 + *    - Converted single pi_lock to individual task locks.
12470 + *
12471 + * By Esben Nielsen:
12472 + *    Doing priority inheritance with help of the scheduler.
12473 + *
12474 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12475 + *  - major rework based on Esben Nielsen's initial patch
12476 + *  - replaced thread_info references by task_struct refs
12477 + *  - removed task->pending_owner dependency
12478 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
12479 + *    in the scheduler return path as discussed with Steven Rostedt
12480 + *
12481 + *  Copyright (C) 2006, Kihon Technologies Inc.
12482 + *    Steven Rostedt <rostedt@goodmis.org>
12483 + *  - debugged and patched Thomas Gleixner's rework.
12484 + *  - added back the cmpxchg to the rework.
12485 + *  - turned atomic require back on for SMP.
12486 + */
12487 +
12488 +#include <linux/spinlock.h>
12489 +#include <linux/rtmutex.h>
12490 +#include <linux/sched.h>
12491 +#include <linux/delay.h>
12492 +#include <linux/module.h>
12493 +#include <linux/kallsyms.h>
12494 +#include <linux/syscalls.h>
12495 +#include <linux/interrupt.h>
12496 +#include <linux/plist.h>
12497 +#include <linux/fs.h>
12498 +#include <linux/futex.h>
12499 +#include <linux/hrtimer.h>
12500 +
12501 +#include "rtmutex_common.h"
12502 +
12503 +/*
12504 + * struct mutex functions
12505 + */
12506 +void __mutex_do_init(struct mutex *mutex, const char *name,
12507 +                    struct lock_class_key *key)
12508 +{
12509 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12510 +       /*
12511 +        * Make sure we are not reinitializing a held lock:
12512 +        */
12513 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
12514 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
12515 +#endif
12516 +       mutex->lock.save_state = 0;
12517 +}
12518 +EXPORT_SYMBOL(__mutex_do_init);
12519 +
12520 +void __lockfunc _mutex_lock(struct mutex *lock)
12521 +{
12522 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12523 +       rt_mutex_lock(&lock->lock);
12524 +}
12525 +EXPORT_SYMBOL(_mutex_lock);
12526 +
12527 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
12528 +{
12529 +       int ret;
12530 +
12531 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12532 +       ret = rt_mutex_lock_interruptible(&lock->lock);
12533 +       if (ret)
12534 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12535 +       return ret;
12536 +}
12537 +EXPORT_SYMBOL(_mutex_lock_interruptible);
12538 +
12539 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
12540 +{
12541 +       int ret;
12542 +
12543 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
12544 +       ret = rt_mutex_lock_killable(&lock->lock);
12545 +       if (ret)
12546 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12547 +       return ret;
12548 +}
12549 +EXPORT_SYMBOL(_mutex_lock_killable);
12550 +
12551 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12552 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
12553 +{
12554 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
12555 +       rt_mutex_lock(&lock->lock);
12556 +}
12557 +EXPORT_SYMBOL(_mutex_lock_nested);
12558 +
12559 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
12560 +{
12561 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
12562 +       rt_mutex_lock(&lock->lock);
12563 +}
12564 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
12565 +
12566 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
12567 +{
12568 +       int ret;
12569 +
12570 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
12571 +       ret = rt_mutex_lock_interruptible(&lock->lock);
12572 +       if (ret)
12573 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12574 +       return ret;
12575 +}
12576 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
12577 +
12578 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
12579 +{
12580 +       int ret;
12581 +
12582 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
12583 +       ret = rt_mutex_lock_killable(&lock->lock);
12584 +       if (ret)
12585 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
12586 +       return ret;
12587 +}
12588 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
12589 +#endif
12590 +
12591 +int __lockfunc _mutex_trylock(struct mutex *lock)
12592 +{
12593 +       int ret = rt_mutex_trylock(&lock->lock);
12594 +
12595 +       if (ret)
12596 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
12597 +
12598 +       return ret;
12599 +}
12600 +EXPORT_SYMBOL(_mutex_trylock);
12601 +
12602 +void __lockfunc _mutex_unlock(struct mutex *lock)
12603 +{
12604 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
12605 +       rt_mutex_unlock(&lock->lock);
12606 +}
12607 +EXPORT_SYMBOL(_mutex_unlock);
12608 +
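
The _mutex_lock*() wrappers above are what the ordinary struct mutex API resolves to on PREEMPT_RT: the lockdep annotations are kept, while the actual blocking is delegated to the embedded rt_mutex, so mutex users gain priority inheritance without any source change. A hypothetical caller, identical to what it would look like on a stock kernel:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);          /* hypothetical lock */
static int demo_count;

static void demo_update(void)
{
        /* On RT this ends up in _mutex_lock() above and may boost the
         * priority of the current owner while we block. */
        mutex_lock(&demo_lock);
        demo_count++;
        mutex_unlock(&demo_lock);
}
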
12609 +/*
12610 + * rwlock_t functions
12611 + */
12612 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
12613 +{
12614 +       int ret;
12615 +
12616 +       migrate_disable();
12617 +       ret = rt_mutex_trylock(&rwlock->lock);
12618 +       if (ret)
12619 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
12620 +       else
12621 +               migrate_enable();
12622 +
12623 +       return ret;
12624 +}
12625 +EXPORT_SYMBOL(rt_write_trylock);
12626 +
12627 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
12628 +{
12629 +       int ret;
12630 +
12631 +       *flags = 0;
12632 +       ret = rt_write_trylock(rwlock);
12633 +       return ret;
12634 +}
12635 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
12636 +
12637 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
12638 +{
12639 +       struct rt_mutex *lock = &rwlock->lock;
12640 +       int ret = 1;
12641 +
12642 +       /*
12643 +        * recursive read locks succeed when current owns the lock,
12644 +        * but not when read_depth == 0 which means that the lock is
12645 +        * write locked.
12646 +        */
12647 +       if (rt_mutex_owner(lock) != current) {
12648 +               migrate_disable();
12649 +               ret = rt_mutex_trylock(lock);
12650 +               if (ret)
12651 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
12652 +               else
12653 +                       migrate_enable();
12654 +
12655 +       } else if (!rwlock->read_depth) {
12656 +               ret = 0;
12657 +       }
12658 +
12659 +       if (ret)
12660 +               rwlock->read_depth++;
12661 +
12662 +       return ret;
12663 +}
12664 +EXPORT_SYMBOL(rt_read_trylock);
12665 +
12666 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
12667 +{
12668 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
12669 +       __rt_spin_lock(&rwlock->lock);
12670 +}
12671 +EXPORT_SYMBOL(rt_write_lock);
12672 +
12673 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
12674 +{
12675 +       struct rt_mutex *lock = &rwlock->lock;
12676 +
12677 +
12678 +       /*
12679 +        * recursive read locks succeed when current owns the lock
12680 +        */
12681 +       if (rt_mutex_owner(lock) != current) {
12682 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
12683 +               __rt_spin_lock(lock);
12684 +       }
12685 +       rwlock->read_depth++;
12686 +}
12687 +
12688 +EXPORT_SYMBOL(rt_read_lock);
12689 +
12690 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
12691 +{
12692 +       /* NOTE: we always pass in '1' for nested, for simplicity */
12693 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
12694 +       __rt_spin_unlock(&rwlock->lock);
12695 +       migrate_enable();
12696 +}
12697 +EXPORT_SYMBOL(rt_write_unlock);
12698 +
12699 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
12700 +{
12701 +       /* Release the lock only when read_depth is down to 0 */
12702 +       if (--rwlock->read_depth == 0) {
12703 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
12704 +               __rt_spin_unlock(&rwlock->lock);
12705 +               migrate_enable();
12706 +       }
12707 +}
12708 +EXPORT_SYMBOL(rt_read_unlock);
12709 +
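
With this implementation an rwlock_t collapses to one rt_mutex plus a per-owner read_depth counter: the owning task may take the read side recursively, but readers on different CPUs no longer run in parallel. A hypothetical reader showing the recursion rule:

#include <linux/spinlock.h>

static DEFINE_RWLOCK(demo_rwlock);       /* hypothetical lock */

static void demo_reader(void)
{
        read_lock(&demo_rwlock);         /* takes the rt_mutex, read_depth = 1 */
        read_lock(&demo_rwlock);         /* same owner: just read_depth++ */
        read_unlock(&demo_rwlock);       /* read_depth back to 1 */
        read_unlock(&demo_rwlock);       /* hits 0, rt_mutex released */
}
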
12710 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
12711 +{
12712 +       rt_write_lock(rwlock);
12713 +
12714 +       return 0;
12715 +}
12716 +EXPORT_SYMBOL(rt_write_lock_irqsave);
12717 +
12718 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
12719 +{
12720 +       rt_read_lock(rwlock);
12721 +
12722 +       return 0;
12723 +}
12724 +EXPORT_SYMBOL(rt_read_lock_irqsave);
12725 +
12726 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
12727 +{
12728 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12729 +       /*
12730 +        * Make sure we are not reinitializing a held lock:
12731 +        */
12732 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
12733 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
12734 +#endif
12735 +       rwlock->lock.save_state = 1;
12736 +       rwlock->read_depth = 0;
12737 +}
12738 +EXPORT_SYMBOL(__rt_rwlock_init);
12739 +
12740 +/*
12741 + * rw_semaphores
12742 + */
12743 +
12744 +void  rt_up_write(struct rw_semaphore *rwsem)
12745 +{
12746 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12747 +       rt_mutex_unlock(&rwsem->lock);
12748 +}
12749 +EXPORT_SYMBOL(rt_up_write);
12750 +
12751 +void __rt_up_read(struct rw_semaphore *rwsem)
12752 +{
12753 +       if (--rwsem->read_depth == 0)
12754 +               rt_mutex_unlock(&rwsem->lock);
12755 +}
12756 +
12757 +void  rt_up_read(struct rw_semaphore *rwsem)
12758 +{
12759 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12760 +       __rt_up_read(rwsem);
12761 +}
12762 +EXPORT_SYMBOL(rt_up_read);
12763 +
12764 +/*
12765 + * downgrade a write lock into a read lock
12766 + * - just wake up any readers at the front of the queue
12767 + */
12768 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
12769 +{
12770 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
12771 +       rwsem->read_depth = 1;
12772 +}
12773 +EXPORT_SYMBOL(rt_downgrade_write);
12774 +
12775 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
12776 +{
12777 +       int ret = rt_mutex_trylock(&rwsem->lock);
12778 +
12779 +       if (ret)
12780 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12781 +       return ret;
12782 +}
12783 +EXPORT_SYMBOL(rt_down_write_trylock);
12784 +
12785 +void  rt_down_write(struct rw_semaphore *rwsem)
12786 +{
12787 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12788 +       rt_mutex_lock(&rwsem->lock);
12789 +}
12790 +EXPORT_SYMBOL(rt_down_write);
12791 +
12792 +int rt_down_write_killable(struct rw_semaphore *rwsem)
12793 +{
12794 +       int ret;
12795 +
12796 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
12797 +       ret = rt_mutex_lock_killable(&rwsem->lock);
12798 +       if (ret)
12799 +               rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12800 +       return ret;
12801 +}
12802 +EXPORT_SYMBOL(rt_down_write_killable);
12803 +
12804 +int rt_down_write_killable_nested(struct rw_semaphore *rwsem, int subclass)
12805 +{
12806 +       int ret;
12807 +
12808 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12809 +       ret = rt_mutex_lock_killable(&rwsem->lock);
12810 +       if (ret)
12811 +               rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
12812 +       return ret;
12813 +}
12814 +EXPORT_SYMBOL(rt_down_write_killable_nested);
12815 +
12816 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
12817 +{
12818 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
12819 +       rt_mutex_lock(&rwsem->lock);
12820 +}
12821 +EXPORT_SYMBOL(rt_down_write_nested);
12822 +
12823 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
12824 +                              struct lockdep_map *nest)
12825 +{
12826 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
12827 +       rt_mutex_lock(&rwsem->lock);
12828 +}
12829 +EXPORT_SYMBOL(rt_down_write_nested_lock);
12830 +
12831 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
12832 +{
12833 +       struct rt_mutex *lock = &rwsem->lock;
12834 +       int ret = 1;
12835 +
12836 +       /*
12837 +        * recursive read locks succeed when current owns the rwsem,
12838 +        * but not when read_depth == 0 which means that the rwsem is
12839 +        * write locked.
12840 +        */
12841 +       if (rt_mutex_owner(lock) != current)
12842 +               ret = rt_mutex_trylock(&rwsem->lock);
12843 +       else if (!rwsem->read_depth)
12844 +               ret = 0;
12845 +
12846 +       if (ret)
12847 +               rwsem->read_depth++;
12848 +       return ret;
12849 +
12850 +}
12851 +
12852 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
12853 +{
12854 +       int ret;
12855 +
12856 +       ret = rt__down_read_trylock(rwsem);
12857 +       if (ret)
12858 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
12859 +
12860 +       return ret;
12861 +}
12862 +EXPORT_SYMBOL(rt_down_read_trylock);
12863 +
12864 +void rt__down_read(struct rw_semaphore *rwsem)
12865 +{
12866 +       struct rt_mutex *lock = &rwsem->lock;
12867 +
12868 +       if (rt_mutex_owner(lock) != current)
12869 +               rt_mutex_lock(&rwsem->lock);
12870 +       rwsem->read_depth++;
12871 +}
12872 +EXPORT_SYMBOL(rt__down_read);
12873 +
12874 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
12875 +{
12876 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
12877 +       rt__down_read(rwsem);
12878 +}
12879 +
12880 +void  rt_down_read(struct rw_semaphore *rwsem)
12881 +{
12882 +       __rt_down_read(rwsem, 0);
12883 +}
12884 +EXPORT_SYMBOL(rt_down_read);
12885 +
12886 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
12887 +{
12888 +       __rt_down_read(rwsem, subclass);
12889 +}
12890 +EXPORT_SYMBOL(rt_down_read_nested);
12891 +
12892 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
12893 +                             struct lock_class_key *key)
12894 +{
12895 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
12896 +       /*
12897 +        * Make sure we are not reinitializing a held lock:
12898 +        */
12899 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
12900 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
12901 +#endif
12902 +       rwsem->read_depth = 0;
12903 +       rwsem->lock.save_state = 0;
12904 +}
12905 +EXPORT_SYMBOL(__rt_rwsem_init);
12906 +
12907 +/**
12908 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
12909 + * @cnt: the atomic which we are to dec
12910 + * @lock: the mutex to return holding if we dec to 0
12911 + *
12912 + * return true and hold lock if we dec to 0, return false otherwise
12913 + */
12914 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
12915 +{
12916 +       /* dec if we can't possibly hit 0 */
12917 +       if (atomic_add_unless(cnt, -1, 1))
12918 +               return 0;
12919 +       /* we might hit 0, so take the lock */
12920 +       mutex_lock(lock);
12921 +       if (!atomic_dec_and_test(cnt)) {
12922 +               /* when we actually did the dec, we didn't hit 0 */
12923 +               mutex_unlock(lock);
12924 +               return 0;
12925 +       }
12926 +       /* we hit 0, and we hold the lock */
12927 +       return 1;
12928 +}
12929 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
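
atomic_dec_and_mutex_lock() exists for the usual "drop the last reference under a lock" idiom: only the 1 -> 0 transition takes the mutex, every other decrement stays lock-free. A hedged sketch with a hypothetical refcounted object whose list linkage is protected by a mutex:

#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_obj {                        /* hypothetical object */
        atomic_t refs;
        struct mutex *list_lock;         /* protects the list the object is on */
};

static void demo_obj_put(struct demo_obj *obj)
{
        /* Returns 1 only when the count hit 0, with *list_lock held, so
         * unlinking and freeing are serialized against lookups. */
        if (atomic_dec_and_mutex_lock(&obj->refs, obj->list_lock)) {
                /* ... unlink obj from its list ... */
                mutex_unlock(obj->list_lock);
                kfree(obj);
        }
}
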
12930 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
12931 index 2c49d76f96c3..4f1a7663c34d 100644
12932 --- a/kernel/locking/rtmutex.c
12933 +++ b/kernel/locking/rtmutex.c
12934 @@ -7,6 +7,11 @@
12935   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
12936   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
12937   *  Copyright (C) 2006 Esben Nielsen
12938 + *  Adaptive Spinlocks:
12939 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
12940 + *                                  and Peter Morreale,
12941 + * Adaptive Spinlocks simplification:
12942 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
12943   *
12944   *  See Documentation/locking/rt-mutex-design.txt for details.
12945   */
12946 @@ -16,6 +21,7 @@
12947  #include <linux/sched/rt.h>
12948  #include <linux/sched/deadline.h>
12949  #include <linux/timer.h>
12950 +#include <linux/ww_mutex.h>
12951  
12952  #include "rtmutex_common.h"
12953  
12954 @@ -133,6 +139,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
12955                 WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
12956  }
12957  
12958 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
12959 +{
12960 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
12961 +               waiter != PI_REQUEUE_INPROGRESS;
12962 +}
12963 +
12964  /*
12965   * We can speed up the acquire/release, if there's no debugging state to be
12966   * set up.
12967 @@ -414,6 +426,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
12968         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
12969  }
12970  
12971 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
12972 +{
12973 +       if (waiter->savestate)
12974 +               wake_up_lock_sleeper(waiter->task);
12975 +       else
12976 +               wake_up_process(waiter->task);
12977 +}
12978 +
12979  /*
12980   * Max number of times we'll walk the boosting chain:
12981   */
12982 @@ -421,7 +441,8 @@ int max_lock_depth = 1024;
12983  
12984  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
12985  {
12986 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
12987 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
12988 +               p->pi_blocked_on->lock : NULL;
12989  }
12990  
12991  /*
12992 @@ -557,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
12993          * reached or the state of the chain has changed while we
12994          * dropped the locks.
12995          */
12996 -       if (!waiter)
12997 +       if (!rt_mutex_real_waiter(waiter))
12998                 goto out_unlock_pi;
12999  
13000         /*
13001 @@ -719,13 +740,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13002          * follow here. This is the end of the chain we are walking.
13003          */
13004         if (!rt_mutex_owner(lock)) {
13005 +               struct rt_mutex_waiter *lock_top_waiter;
13006 +
13007                 /*
13008                  * If the requeue [7] above changed the top waiter,
13009                  * then we need to wake the new top waiter up to try
13010                  * to get the lock.
13011                  */
13012 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
13013 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
13014 +               lock_top_waiter = rt_mutex_top_waiter(lock);
13015 +               if (prerequeue_top_waiter != lock_top_waiter)
13016 +                       rt_mutex_wake_waiter(lock_top_waiter);
13017                 raw_spin_unlock_irq(&lock->wait_lock);
13018                 return 0;
13019         }
13020 @@ -818,6 +842,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13021         return ret;
13022  }
13023  
13024 +
13025 +#define STEAL_NORMAL  0
13026 +#define STEAL_LATERAL 1
13027 +
13028 +/*
13029 + * Note that RT tasks are excluded from lateral-steals to prevent the
13030 + * introduction of an unbounded latency
13031 + */
13032 +static inline int lock_is_stealable(struct task_struct *task,
13033 +                                   struct task_struct *pendowner, int mode)
13034 +{
13035 +    if (mode == STEAL_NORMAL || rt_task(task)) {
13036 +           if (task->prio >= pendowner->prio)
13037 +                   return 0;
13038 +    } else if (task->prio > pendowner->prio)
13039 +           return 0;
13040 +    return 1;
13041 +}
13042 +
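
lock_is_stealable() encodes the two steal policies: STEAL_NORMAL lets a contender take the lock from the pending owner only with strictly higher priority (numerically lower ->prio), while STEAL_LATERAL also accepts equal priority; RT tasks are always held to the strict rule so a lateral steal can never add unbounded latency to them. A stand-alone model of the decision with plain integers instead of task_structs (purely illustrative, not kernel API):

#define STEAL_NORMAL  0
#define STEAL_LATERAL 1

/* Lower prio value means higher priority, as in the kernel. */
static int model_is_stealable(int task_prio, int owner_prio,
                              int task_is_rt, int mode)
{
        if (mode == STEAL_NORMAL || task_is_rt)
                return task_prio < owner_prio;   /* strictly higher priority */
        return task_prio <= owner_prio;          /* lateral: equal is enough */
}

In the code that follows, the spinlock slowpath passes STEAL_LATERAL, while try_to_take_rt_mutex() for mutexes and rwsems keeps STEAL_NORMAL.
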
13043  /*
13044   * Try to take an rt-mutex
13045   *
13046 @@ -828,8 +871,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
13047   * @waiter: The waiter that is queued to the lock's wait tree if the
13048   *         callsite called task_blocked_on_lock(), otherwise NULL
13049   */
13050 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13051 -                               struct rt_mutex_waiter *waiter)
13052 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
13053 +                                 struct task_struct *task,
13054 +                                 struct rt_mutex_waiter *waiter, int mode)
13055  {
13056         /*
13057          * Before testing whether we can acquire @lock, we set the
13058 @@ -866,8 +910,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13059                  * If waiter is not the highest priority waiter of
13060                  * @lock, give up.
13061                  */
13062 -               if (waiter != rt_mutex_top_waiter(lock))
13063 +               if (waiter != rt_mutex_top_waiter(lock)) {
13064 +                       /* XXX lock_is_stealable() ? */
13065                         return 0;
13066 +               }
13067  
13068                 /*
13069                  * We can acquire the lock. Remove the waiter from the
13070 @@ -885,14 +931,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13071                  * not need to be dequeued.
13072                  */
13073                 if (rt_mutex_has_waiters(lock)) {
13074 -                       /*
13075 -                        * If @task->prio is greater than or equal to
13076 -                        * the top waiter priority (kernel view),
13077 -                        * @task lost.
13078 -                        */
13079 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
13080 -                               return 0;
13081 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
13082  
13083 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
13084 +                               return 0;
13085                         /*
13086                          * The current top waiter stays enqueued. We
13087                          * don't have to change anything in the lock
13088 @@ -941,6 +983,433 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13089         return 1;
13090  }
13091  
13092 +#ifdef CONFIG_PREEMPT_RT_FULL
13093 +/*
13094 + * preemptible spin_lock functions:
13095 + */
13096 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
13097 +                                        void  (*slowfn)(struct rt_mutex *lock,
13098 +                                                        bool mg_off),
13099 +                                        bool do_mig_dis)
13100 +{
13101 +       might_sleep_no_state_check();
13102 +
13103 +       if (do_mig_dis)
13104 +               migrate_disable();
13105 +
13106 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
13107 +               rt_mutex_deadlock_account_lock(lock, current);
13108 +       else
13109 +               slowfn(lock, do_mig_dis);
13110 +}
13111 +
13112 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
13113 +                                         int (*slowfn)(struct rt_mutex *lock))
13114 +{
13115 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13116 +               rt_mutex_deadlock_account_unlock(current);
13117 +               return 0;
13118 +       }
13119 +       return slowfn(lock);
13120 +}
13121 +#ifdef CONFIG_SMP
13122 +/*
13123 + * Note that owner is a speculative pointer and dereferencing relies
13124 + * on rcu_read_lock() and the check against the lock owner.
13125 + */
13126 +static int adaptive_wait(struct rt_mutex *lock,
13127 +                        struct task_struct *owner)
13128 +{
13129 +       int res = 0;
13130 +
13131 +       rcu_read_lock();
13132 +       for (;;) {
13133 +               if (owner != rt_mutex_owner(lock))
13134 +                       break;
13135 +               /*
13136 +                * Ensure that owner->on_cpu is dereferenced _after_
13137 +                * checking the above to be valid.
13138 +                */
13139 +               barrier();
13140 +               if (!owner->on_cpu) {
13141 +                       res = 1;
13142 +                       break;
13143 +               }
13144 +               cpu_relax();
13145 +       }
13146 +       rcu_read_unlock();
13147 +       return res;
13148 +}
13149 +#else
13150 +static int adaptive_wait(struct rt_mutex *lock,
13151 +                        struct task_struct *orig_owner)
13152 +{
13153 +       return 1;
13154 +}
13155 +#endif
13156 +
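
adaptive_wait() is the optimistic-spin half of the adaptive spinlock: as long as the lock owner is still executing on some CPU it is usually cheaper to keep spinning for the release, and only when the owner is preempted (or the owner changes) is it worth really scheduling out. The slowpath below consumes it roughly like this (condensed excerpt for illustration, not compilable on its own):

for (;;) {
        if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
                break;                           /* lock acquired */

        lock_owner = rt_mutex_owner(lock);
        if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
                schedule();                      /* owner off CPU: really block */
        /* otherwise the owner is still running, retry immediately */
}
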
13157 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13158 +                                  struct rt_mutex_waiter *waiter,
13159 +                                  struct task_struct *task,
13160 +                                  enum rtmutex_chainwalk chwalk);
13161 +/*
13162 + * Slow path lock function spin_lock style: this variant is very
13163 + * careful not to miss any non-lock wakeups.
13164 + *
13165 + * We store the current state under p->pi_lock in p->saved_state and
13166 + * the try_to_wake_up() code handles this accordingly.
13167 + */
13168 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
13169 +                                                   bool mg_off)
13170 +{
13171 +       struct task_struct *lock_owner, *self = current;
13172 +       struct rt_mutex_waiter waiter, *top_waiter;
13173 +       unsigned long flags;
13174 +       int ret;
13175 +
13176 +       rt_mutex_init_waiter(&waiter, true);
13177 +
13178 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
13179 +
13180 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
13181 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13182 +               return;
13183 +       }
13184 +
13185 +       BUG_ON(rt_mutex_owner(lock) == self);
13186 +
13187 +       /*
13188 +        * We save whatever state the task is in and we'll restore it
13189 +        * after acquiring the lock taking real wakeups into account
13190 +        * as well. We are serialized via pi_lock against wakeups. See
13191 +        * try_to_wake_up().
13192 +        */
13193 +       raw_spin_lock(&self->pi_lock);
13194 +       self->saved_state = self->state;
13195 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
13196 +       raw_spin_unlock(&self->pi_lock);
13197 +
13198 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
13199 +       BUG_ON(ret);
13200 +
13201 +       for (;;) {
13202 +               /* Try to acquire the lock again. */
13203 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
13204 +                       break;
13205 +
13206 +               top_waiter = rt_mutex_top_waiter(lock);
13207 +               lock_owner = rt_mutex_owner(lock);
13208 +
13209 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13210 +
13211 +               debug_rt_mutex_print_deadlock(&waiter);
13212 +
13213 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
13214 +                       if (mg_off)
13215 +                               migrate_enable();
13216 +                       schedule();
13217 +                       if (mg_off)
13218 +                               migrate_disable();
13219 +               }
13220 +
13221 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
13222 +
13223 +               raw_spin_lock(&self->pi_lock);
13224 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
13225 +               raw_spin_unlock(&self->pi_lock);
13226 +       }
13227 +
13228 +       /*
13229 +        * Restore the task state to current->saved_state. We set it
13230 +        * to the original state above and the try_to_wake_up() code
13231 +        * has possibly updated it when a real (non-rtmutex) wakeup
13232 +        * happened while we were blocked. Clear saved_state so
13233 +        * try_to_wake_up() does not get confused.
13234 +        */
13235 +       raw_spin_lock(&self->pi_lock);
13236 +       __set_current_state_no_track(self->saved_state);
13237 +       self->saved_state = TASK_RUNNING;
13238 +       raw_spin_unlock(&self->pi_lock);
13239 +
13240 +       /*
13241 +        * try_to_take_rt_mutex() sets the waiter bit
13242 +        * unconditionally. We might have to fix that up:
13243 +        */
13244 +       fixup_rt_mutex_waiters(lock);
13245 +
13246 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
13247 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
13248 +
13249 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13250 +
13251 +       debug_rt_mutex_free_waiter(&waiter);
13252 +}
13253 +
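
rt_spin_lock_slowlock() is what turns spinlock_t into a sleeping, priority-inheriting lock on RT while leaving callers untouched: the task's real wakeup state is parked in ->saved_state, so a genuine (non-lock) wakeup that arrives while we block here is preserved and restored afterwards. Caller-side code looks exactly as on a stock kernel (hypothetical example):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_slock);      /* hypothetical lock */
static unsigned long demo_events;

static void demo_account_event(void)
{
        /* With PREEMPT_RT_FULL this routes through rt_spin_lock() and, on
         * contention, the slowlock above: the section may sleep, but stays
         * migrate-disabled and boosts the owner via PI. */
        spin_lock(&demo_slock);
        demo_events++;
        spin_unlock(&demo_slock);
}
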
13254 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
13255 +                                   struct wake_q_head *wake_sleeper_q,
13256 +                                   struct rt_mutex *lock);
13257 +/*
13258 + * Slow path to release a rt_mutex spin_lock style
13259 + */
13260 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
13261 +{
13262 +       unsigned long flags;
13263 +       WAKE_Q(wake_q);
13264 +       WAKE_Q(wake_sleeper_q);
13265 +
13266 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
13267 +
13268 +       debug_rt_mutex_unlock(lock);
13269 +
13270 +       rt_mutex_deadlock_account_unlock(current);
13271 +
13272 +       if (!rt_mutex_has_waiters(lock)) {
13273 +               lock->owner = NULL;
13274 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13275 +               return 0;
13276 +       }
13277 +
13278 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
13279 +
13280 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13281 +       wake_up_q(&wake_q);
13282 +       wake_up_q_sleeper(&wake_sleeper_q);
13283 +
13284 +       /* Undo pi boosting when necessary */
13285 +       rt_mutex_adjust_prio(current);
13286 +       return 0;
13287 +}
13288 +
13289 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
13290 +{
13291 +       unsigned long flags;
13292 +       WAKE_Q(wake_q);
13293 +       WAKE_Q(wake_sleeper_q);
13294 +
13295 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
13296 +
13297 +       debug_rt_mutex_unlock(lock);
13298 +
13299 +       rt_mutex_deadlock_account_unlock(current);
13300 +
13301 +       if (!rt_mutex_has_waiters(lock)) {
13302 +               lock->owner = NULL;
13303 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13304 +               return 0;
13305 +       }
13306 +
13307 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
13308 +
13309 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13310 +       wake_up_q(&wake_q);
13311 +       wake_up_q_sleeper(&wake_sleeper_q);
13312 +       return 1;
13313 +}
13314 +
13315 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
13316 +{
13317 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
13318 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13319 +}
13320 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
13321 +
13322 +void __lockfunc rt_spin_lock(spinlock_t *lock)
13323 +{
13324 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
13325 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13326 +}
13327 +EXPORT_SYMBOL(rt_spin_lock);
13328 +
13329 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
13330 +{
13331 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
13332 +}
13333 +EXPORT_SYMBOL(__rt_spin_lock);
13334 +
13335 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
13336 +{
13337 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
13338 +}
13339 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
13340 +
13341 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13342 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
13343 +{
13344 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
13345 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
13346 +}
13347 +EXPORT_SYMBOL(rt_spin_lock_nested);
13348 +#endif
13349 +
13350 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
13351 +{
13352 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13353 +       spin_release(&lock->dep_map, 1, _RET_IP_);
13354 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
13355 +}
13356 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
13357 +
13358 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
13359 +{
13360 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13361 +       spin_release(&lock->dep_map, 1, _RET_IP_);
13362 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
13363 +       migrate_enable();
13364 +}
13365 +EXPORT_SYMBOL(rt_spin_unlock);
13366 +
13367 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
13368 +{
13369 +       int ret;
13370 +
13371 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13372 +       spin_release(&lock->dep_map, 1, _RET_IP_);
13373 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
13374 +       migrate_enable();
13375 +       return ret;
13376 +}
13377 +
13378 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
13379 +{
13380 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
13381 +}
13382 +EXPORT_SYMBOL(__rt_spin_unlock);
13383 +
13384 +/*
13385 + * Wait for the lock to get unlocked: instead of polling for an unlock
13386 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
13387 + * schedule if there's contention:
13388 + */
13389 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
13390 +{
13391 +       spin_lock(lock);
13392 +       spin_unlock(lock);
13393 +}
13394 +EXPORT_SYMBOL(rt_spin_unlock_wait);
13395 +
13396 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
13397 +{
13398 +       int ret;
13399 +
13400 +       ret = rt_mutex_trylock(&lock->lock);
13401 +       if (ret)
13402 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13403 +       return ret;
13404 +}
13405 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
13406 +
13407 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
13408 +{
13409 +       int ret;
13410 +
13411 +       migrate_disable();
13412 +       ret = rt_mutex_trylock(&lock->lock);
13413 +       if (ret)
13414 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13415 +       else
13416 +               migrate_enable();
13417 +       return ret;
13418 +}
13419 +EXPORT_SYMBOL(rt_spin_trylock);
13420 +
13421 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
13422 +{
13423 +       int ret;
13424 +
13425 +       local_bh_disable();
13426 +       ret = rt_mutex_trylock(&lock->lock);
13427 +       if (ret) {
13428 +               migrate_disable();
13429 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13430 +       } else
13431 +               local_bh_enable();
13432 +       return ret;
13433 +}
13434 +EXPORT_SYMBOL(rt_spin_trylock_bh);
13435 +
13436 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
13437 +{
13438 +       int ret;
13439 +
13440 +       *flags = 0;
13441 +       ret = rt_mutex_trylock(&lock->lock);
13442 +       if (ret) {
13443 +               migrate_disable();
13444 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13445 +       }
13446 +       return ret;
13447 +}
13448 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
13449 +
13450 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
13451 +{
13452 +       /* Subtract 1 from counter unless that drops it to 0 (i.e. it was 1) */
13453 +       if (atomic_add_unless(atomic, -1, 1))
13454 +               return 0;
13455 +       rt_spin_lock(lock);
13456 +       if (atomic_dec_and_test(atomic))
13457 +               return 1;
13458 +       rt_spin_unlock(lock);
13459 +       return 0;
13460 +}
13461 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
13462 +
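
atomic_dec_and_spin_lock() is the spinlock_t counterpart of atomic_dec_and_mutex_lock() above: only the final 1 -> 0 decrement is performed with the lock held. A hypothetical last-put path:

#include <linux/atomic.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_item_lock);  /* hypothetical list lock */

struct demo_item {
        atomic_t refs;
        /* ... list linkage protected by demo_item_lock ... */
};

static void demo_item_put(struct demo_item *item)
{
        /* Returns 1 only for the 1 -> 0 transition, with demo_item_lock held. */
        if (atomic_dec_and_spin_lock(&item->refs, &demo_item_lock)) {
                /* ... unlink item ... */
                spin_unlock(&demo_item_lock);
                kfree(item);
        }
}
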
13463 +       void
13464 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key)
13465 +{
13466 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13467 +       /*
13468 +        * Make sure we are not reinitializing a held lock:
13469 +        */
13470 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
13471 +       lockdep_init_map(&lock->dep_map, name, key, 0);
13472 +#endif
13473 +}
13474 +EXPORT_SYMBOL(__rt_spin_lock_init);
13475 +
13476 +#endif /* PREEMPT_RT_FULL */
13477 +
13478 +#ifdef CONFIG_PREEMPT_RT_FULL
13479 +       static inline int __sched
13480 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
13481 +{
13482 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
13483 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
13484 +
13485 +       if (!hold_ctx)
13486 +               return 0;
13487 +
13488 +       if (unlikely(ctx == hold_ctx))
13489 +               return -EALREADY;
13490 +
13491 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
13492 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
13493 +#ifdef CONFIG_DEBUG_MUTEXES
13494 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
13495 +               ctx->contending_lock = ww;
13496 +#endif
13497 +               return -EDEADLK;
13498 +       }
13499 +
13500 +       return 0;
13501 +}
13502 +#else
13503 +       static inline int __sched
13504 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
13505 +{
13506 +       BUG();
13507 +       return 0;
13508 +}
13509 +
13510 +#endif
13511 +
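
__mutex_lock_check_stamp() turns contention between two ww_acquire contexts into a deterministic verdict: the context that already holds the lock gets -EALREADY, and of two different contexts the younger one (larger stamp) gets -EDEADLK and must back off. A hedged sketch of the classic caller-side pattern this enables; demo_ww_class and both mutexes are hypothetical and assumed to have been set up with ww_mutex_init():

#include <linux/kernel.h>                /* swap() */
#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(demo_ww_class);   /* hypothetical class */

static void demo_lock_both(struct ww_mutex *a, struct ww_mutex *b)
{
        struct ww_acquire_ctx ctx;

        ww_acquire_init(&ctx, &demo_ww_class);

        ww_mutex_lock(a, &ctx);
        while (ww_mutex_lock(b, &ctx) == -EDEADLK) {
                /* We are the younger context: drop what we hold and sleep
                 * on the contended lock so the older context can finish. */
                ww_mutex_unlock(a);
                ww_mutex_lock_slow(b, &ctx);
                swap(a, b);              /* then retry the remaining lock */
        }
        ww_acquire_done(&ctx);

        /* ... both objects are locked ... */

        ww_mutex_unlock(a);
        ww_mutex_unlock(b);
        ww_acquire_fini(&ctx);
}
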
13512 +static inline int
13513 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
13514 +                    struct rt_mutex_waiter *waiter)
13515 +{
13516 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
13517 +}
13518 +
13519  /*
13520   * Task blocks on lock.
13521   *
13522 @@ -971,6 +1440,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13523                 return -EDEADLK;
13524  
13525         raw_spin_lock(&task->pi_lock);
13526 +
13527 +       /*
13528 +        * In the case of futex requeue PI, this will be a proxy
13529 +        * lock. The task will wake unaware that it is enqueued on
13530 +        * this lock. Avoid blocking on two locks and corrupting
13531 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
13532 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
13533 +        * before requeue (due to a signal or timeout). Do not enqueue
13534 +        * the task if PI_WAKEUP_INPROGRESS is set.
13535 +        */
13536 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
13537 +               raw_spin_unlock(&task->pi_lock);
13538 +               return -EAGAIN;
13539 +       }
13540 +
13541 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
13542 +
13543         __rt_mutex_adjust_prio(task);
13544         waiter->task = task;
13545         waiter->lock = lock;
13546 @@ -994,7 +1480,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13547                 rt_mutex_enqueue_pi(owner, waiter);
13548  
13549                 __rt_mutex_adjust_prio(owner);
13550 -               if (owner->pi_blocked_on)
13551 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
13552                         chain_walk = 1;
13553         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
13554                 chain_walk = 1;
13555 @@ -1036,6 +1522,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
13556   * Called with lock->wait_lock held and interrupts disabled.
13557   */
13558  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
13559 +                                   struct wake_q_head *wake_sleeper_q,
13560                                     struct rt_mutex *lock)
13561  {
13562         struct rt_mutex_waiter *waiter;
13563 @@ -1064,7 +1551,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
13564  
13565         raw_spin_unlock(&current->pi_lock);
13566  
13567 -       wake_q_add(wake_q, waiter->task);
13568 +       if (waiter->savestate)
13569 +               wake_q_add(wake_sleeper_q, waiter->task);
13570 +       else
13571 +               wake_q_add(wake_q, waiter->task);
13572  }
13573  
13574  /*
13575 @@ -1078,7 +1568,7 @@ static void remove_waiter(struct rt_mutex *lock,
13576  {
13577         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
13578         struct task_struct *owner = rt_mutex_owner(lock);
13579 -       struct rt_mutex *next_lock;
13580 +       struct rt_mutex *next_lock = NULL;
13581  
13582         raw_spin_lock(&current->pi_lock);
13583         rt_mutex_dequeue(lock, waiter);
13584 @@ -1102,7 +1592,8 @@ static void remove_waiter(struct rt_mutex *lock,
13585         __rt_mutex_adjust_prio(owner);
13586  
13587         /* Store the lock on which owner is blocked or NULL */
13588 -       next_lock = task_blocked_on_lock(owner);
13589 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
13590 +               next_lock = task_blocked_on_lock(owner);
13591  
13592         raw_spin_unlock(&owner->pi_lock);
13593  
13594 @@ -1138,17 +1629,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
13595         raw_spin_lock_irqsave(&task->pi_lock, flags);
13596  
13597         waiter = task->pi_blocked_on;
13598 -       if (!waiter || (waiter->prio == task->prio &&
13599 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
13600                         !dl_prio(task->prio))) {
13601                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13602                 return;
13603         }
13604         next_lock = waiter->lock;
13605 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13606  
13607         /* gets dropped in rt_mutex_adjust_prio_chain()! */
13608         get_task_struct(task);
13609  
13610 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
13611         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
13612                                    next_lock, NULL, task);
13613  }
13614 @@ -1166,7 +1657,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
13615  static int __sched
13616  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
13617                     struct hrtimer_sleeper *timeout,
13618 -                   struct rt_mutex_waiter *waiter)
13619 +                   struct rt_mutex_waiter *waiter,
13620 +                   struct ww_acquire_ctx *ww_ctx)
13621  {
13622         int ret = 0;
13623  
13624 @@ -1189,6 +1681,12 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
13625                                 break;
13626                 }
13627  
13628 +               if (ww_ctx && ww_ctx->acquired > 0) {
13629 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
13630 +                       if (ret)
13631 +                               break;
13632 +               }
13633 +
13634                 raw_spin_unlock_irq(&lock->wait_lock);
13635  
13636                 debug_rt_mutex_print_deadlock(waiter);
13637 @@ -1223,21 +1721,96 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
13638         }
13639  }
13640  
13641 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
13642 +                                                  struct ww_acquire_ctx *ww_ctx)
13643 +{
13644 +#ifdef CONFIG_DEBUG_MUTEXES
13645 +       /*
13646 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
13647 +        * but released with a normal mutex_unlock in this call.
13648 +        *
13649 +        * This should never happen, always use ww_mutex_unlock.
13650 +        */
13651 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
13652 +
13653 +       /*
13654 +        * Not quite done after calling ww_acquire_done() ?
13655 +        */
13656 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
13657 +
13658 +       if (ww_ctx->contending_lock) {
13659 +               /*
13660 +                * After -EDEADLK you tried to
13661 +                * acquire a different ww_mutex? Bad!
13662 +                */
13663 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
13664 +
13665 +               /*
13666 +                * You called ww_mutex_lock after receiving -EDEADLK,
13667 +                * but 'forgot' to unlock everything else first?
13668 +                */
13669 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
13670 +               ww_ctx->contending_lock = NULL;
13671 +       }
13672 +
13673 +       /*
13674 +        * Naughty, using a different class will lead to undefined behavior!
13675 +        */
13676 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
13677 +#endif
13678 +       ww_ctx->acquired++;
13679 +}
13680 +
13681 +#ifdef CONFIG_PREEMPT_RT_FULL
13682 +static void ww_mutex_account_lock(struct rt_mutex *lock,
13683 +                                 struct ww_acquire_ctx *ww_ctx)
13684 +{
13685 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
13686 +       struct rt_mutex_waiter *waiter, *n;
13687 +
13688 +       /*
13689 +        * This branch gets optimized out for the common case,
13690 +        * and is only important for ww_mutex_lock.
13691 +        */
13692 +       ww_mutex_lock_acquired(ww, ww_ctx);
13693 +       ww->ctx = ww_ctx;
13694 +
13695 +       /*
13696 +        * Give any possible sleeping processes the chance to wake up,
13697 +        * so they can recheck if they have to back off.
13698 +        */
13699 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
13700 +                                            tree_entry) {
13701 +               /* XXX debug rt mutex waiter wakeup */
13702 +
13703 +               BUG_ON(waiter->lock != lock);
13704 +               rt_mutex_wake_waiter(waiter);
13705 +       }
13706 +}
13707 +
13708 +#else
13709 +
13710 +static void ww_mutex_account_lock(struct rt_mutex *lock,
13711 +                                 struct ww_acquire_ctx *ww_ctx)
13712 +{
13713 +       BUG();
13714 +}
13715 +#endif
13716 +
13717  /*
13718   * Slow path lock function:
13719   */
13720  static int __sched
13721  rt_mutex_slowlock(struct rt_mutex *lock, int state,
13722                   struct hrtimer_sleeper *timeout,
13723 -                 enum rtmutex_chainwalk chwalk)
13724 +                 enum rtmutex_chainwalk chwalk,
13725 +                 struct ww_acquire_ctx *ww_ctx)
13726  {
13727         struct rt_mutex_waiter waiter;
13728         unsigned long flags;
13729         int ret = 0;
13730  
13731 -       debug_rt_mutex_init_waiter(&waiter);
13732 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
13733 -       RB_CLEAR_NODE(&waiter.tree_entry);
13734 +       rt_mutex_init_waiter(&waiter, false);
13735  
13736         /*
13737          * Technically we could use raw_spin_[un]lock_irq() here, but this can
13738 @@ -1251,6 +1824,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
13739  
13740         /* Try to acquire the lock again: */
13741         if (try_to_take_rt_mutex(lock, current, NULL)) {
13742 +               if (ww_ctx)
13743 +                       ww_mutex_account_lock(lock, ww_ctx);
13744                 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13745                 return 0;
13746         }
13747 @@ -1265,13 +1840,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
13748  
13749         if (likely(!ret))
13750                 /* sleep on the mutex */
13751 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
13752 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
13753 +                                         ww_ctx);
13754 +       else if (ww_ctx) {
13755 +               /* ww_mutex received EDEADLK, let it become EALREADY */
13756 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
13757 +               BUG_ON(!ret);
13758 +       }
13759  
13760         if (unlikely(ret)) {
13761                 __set_current_state(TASK_RUNNING);
13762                 if (rt_mutex_has_waiters(lock))
13763                         remove_waiter(lock, &waiter);
13764 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13765 +               /* ww_mutex want to report EDEADLK/EALREADY, let them */
13766 +               if (!ww_ctx)
13767 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
13768 +       } else if (ww_ctx) {
13769 +               ww_mutex_account_lock(lock, ww_ctx);
13770         }
13771  
13772         /*
13773 @@ -1331,7 +1916,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
13774   * Return whether the current task needs to undo a potential priority boosting.
13775   */
13776  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13777 -                                       struct wake_q_head *wake_q)
13778 +                                       struct wake_q_head *wake_q,
13779 +                                       struct wake_q_head *wake_sleeper_q)
13780  {
13781         unsigned long flags;
13782  
13783 @@ -1387,7 +1973,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13784          *
13785          * Queue the next waiter for wakeup once we release the wait_lock.
13786          */
13787 -       mark_wakeup_next_waiter(wake_q, lock);
13788 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
13789  
13790         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
13791  
13792 @@ -1403,31 +1989,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
13793   */
13794  static inline int
13795  rt_mutex_fastlock(struct rt_mutex *lock, int state,
13796 +                 struct ww_acquire_ctx *ww_ctx,
13797                   int (*slowfn)(struct rt_mutex *lock, int state,
13798                                 struct hrtimer_sleeper *timeout,
13799 -                               enum rtmutex_chainwalk chwalk))
13800 +                               enum rtmutex_chainwalk chwalk,
13801 +                               struct ww_acquire_ctx *ww_ctx))
13802  {
13803         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13804                 rt_mutex_deadlock_account_lock(lock, current);
13805                 return 0;
13806         } else
13807 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
13808 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
13809 +                             ww_ctx);
13810  }
13811  
13812  static inline int
13813  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
13814                         struct hrtimer_sleeper *timeout,
13815                         enum rtmutex_chainwalk chwalk,
13816 +                       struct ww_acquire_ctx *ww_ctx,
13817                         int (*slowfn)(struct rt_mutex *lock, int state,
13818                                       struct hrtimer_sleeper *timeout,
13819 -                                     enum rtmutex_chainwalk chwalk))
13820 +                                     enum rtmutex_chainwalk chwalk,
13821 +                                     struct ww_acquire_ctx *ww_ctx))
13822  {
13823         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
13824             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
13825                 rt_mutex_deadlock_account_lock(lock, current);
13826                 return 0;
13827         } else
13828 -               return slowfn(lock, state, timeout, chwalk);
13829 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
13830  }
13831  
13832  static inline int
13833 @@ -1444,17 +2035,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
13834  static inline void
13835  rt_mutex_fastunlock(struct rt_mutex *lock,
13836                     bool (*slowfn)(struct rt_mutex *lock,
13837 -                                  struct wake_q_head *wqh))
13838 +                                  struct wake_q_head *wqh,
13839 +                                  struct wake_q_head *wq_sleeper))
13840  {
13841         WAKE_Q(wake_q);
13842 +       WAKE_Q(wake_sleeper_q);
13843  
13844         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13845                 rt_mutex_deadlock_account_unlock(current);
13846  
13847         } else {
13848 -               bool deboost = slowfn(lock, &wake_q);
13849 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
13850  
13851                 wake_up_q(&wake_q);
13852 +               wake_up_q_sleeper(&wake_sleeper_q);
13853  
13854                 /* Undo pi boosting if necessary: */
13855                 if (deboost)
13856 @@ -1471,7 +2065,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
13857  {
13858         might_sleep();
13859  
13860 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
13861 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
13862  }
13863  EXPORT_SYMBOL_GPL(rt_mutex_lock);
13864  
13865 @@ -1488,7 +2082,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
13866  {
13867         might_sleep();
13868  
13869 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
13870 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
13871  }
13872  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
13873  
13874 @@ -1501,11 +2095,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
13875         might_sleep();
13876  
13877         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13878 -                                      RT_MUTEX_FULL_CHAINWALK,
13879 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
13880                                        rt_mutex_slowlock);
13881  }
13882  
13883  /**
13884 + * rt_mutex_lock_killable - lock a rt_mutex killable
13885 + *
13886 + * @lock:              the rt_mutex to be locked
13888 + *
13889 + * Returns:
13890 + *  0          on success
13891 + * -EINTR      when interrupted by a signal
13892 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
13893 + */
13894 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
13895 +{
13896 +       might_sleep();
13897 +
13898 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
13899 +}
13900 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
13901 +
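
rt_mutex_lock_killable() parallels mutex_lock_killable(): the sleep can only be broken by a fatal signal, which suits paths that must not bail out on ordinary signals but should not keep a dying task blocked. A hypothetical caller:

#include <linux/rtmutex.h>

static int demo_do_locked_work(struct rt_mutex *lock)
{
        int ret;

        ret = rt_mutex_lock_killable(lock);
        if (ret)                         /* -EINTR: the task is being killed */
                return ret;

        /* ... critical section ... */

        rt_mutex_unlock(lock);
        return 0;
}
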
13902 +/**
13903   * rt_mutex_timed_lock - lock a rt_mutex interruptible
13904   *                     the timeout structure is provided
13905   *                     by the caller
13906 @@ -1525,6 +2138,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
13907  
13908         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
13909                                        RT_MUTEX_MIN_CHAINWALK,
13910 +                                      NULL,
13911                                        rt_mutex_slowlock);
13912  }
13913  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
13914 @@ -1542,7 +2156,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
13915   */
13916  int __sched rt_mutex_trylock(struct rt_mutex *lock)
13917  {
13918 +#ifdef CONFIG_PREEMPT_RT_FULL
13919 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
13920 +#else
13921         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
13922 +#endif
13923                 return 0;
13924  
13925         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
13926 @@ -1568,13 +2186,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
13927   * required or not.
13928   */
13929  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
13930 -                                  struct wake_q_head *wqh)
13931 +                                  struct wake_q_head *wqh,
13932 +                                  struct wake_q_head *wq_sleeper)
13933  {
13934         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
13935                 rt_mutex_deadlock_account_unlock(current);
13936                 return false;
13937         }
13938 -       return rt_mutex_slowunlock(lock, wqh);
13939 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
13940  }
13941  
13942  /**
13943 @@ -1607,13 +2226,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
13944  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
13945  {
13946         lock->owner = NULL;
13947 -       raw_spin_lock_init(&lock->wait_lock);
13948         lock->waiters = RB_ROOT;
13949         lock->waiters_leftmost = NULL;
13950  
13951         debug_rt_mutex_init(lock, name);
13952  }
13953 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
13954 +EXPORT_SYMBOL(__rt_mutex_init);
13955  
13956  /**
13957   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
13958 @@ -1628,7 +2246,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
13959  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
13960                                 struct task_struct *proxy_owner)
13961  {
13962 -       __rt_mutex_init(lock, NULL);
13963 +       rt_mutex_init(lock);
13964         debug_rt_mutex_proxy_lock(lock, proxy_owner);
13965         rt_mutex_set_owner(lock, proxy_owner);
13966         rt_mutex_deadlock_account_lock(lock, proxy_owner);
13967 @@ -1676,6 +2294,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
13968                 return 1;
13969         }
13970  
13971 +#ifdef CONFIG_PREEMPT_RT_FULL
13972 +       /*
13973 +        * In PREEMPT_RT there's an added race.
13974 +        * If the task that we are about to requeue times out,
13975 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
13976 +        * to skip this task. But right after the task sets
13977 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
13978 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
13979 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
13980 +        * lock that it blocks on. We *must not* place this task
13981 +        * on this proxy lock in that case.
13982 +        *
13983 +        * To prevent this race, we first take the task's pi_lock
13984 +        * and check if it has updated its pi_blocked_on. If it has,
13985 +        * we assume that it woke up and we return -EAGAIN.
13986 +        * Otherwise, we set the task's pi_blocked_on to
13987 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
13988 +        * it will know that we are in the process of requeuing it.
13989 +        */
13990 +       raw_spin_lock(&task->pi_lock);
13991 +       if (task->pi_blocked_on) {
13992 +               raw_spin_unlock(&task->pi_lock);
13993 +               raw_spin_unlock_irq(&lock->wait_lock);
13994 +               return -EAGAIN;
13995 +       }
13996 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
13997 +       raw_spin_unlock(&task->pi_lock);
13998 +#endif
13999 +
14000         /* We enforce deadlock detection for futexes */
14001         ret = task_blocks_on_rt_mutex(lock, waiter, task,
14002                                       RT_MUTEX_FULL_CHAINWALK);
14003 @@ -1690,7 +2337,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
14004                 ret = 0;
14005         }
14006  
14007 -       if (unlikely(ret))
14008 +       if (ret && rt_mutex_has_waiters(lock))
14009                 remove_waiter(lock, waiter);
14010  
14011         raw_spin_unlock_irq(&lock->wait_lock);
14012 @@ -1746,7 +2393,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
14013         set_current_state(TASK_INTERRUPTIBLE);
14014  
14015         /* sleep on the mutex */
14016 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
14017 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
14018  
14019         if (unlikely(ret))
14020                 remove_waiter(lock, waiter);
14021 @@ -1761,3 +2408,89 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
14022  
14023         return ret;
14024  }
14025 +
14026 +static inline int
14027 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
14028 +{
14029 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
14030 +       unsigned tmp;
14031 +
14032 +       if (ctx->deadlock_inject_countdown-- == 0) {
14033 +               tmp = ctx->deadlock_inject_interval;
14034 +               if (tmp > UINT_MAX/4)
14035 +                       tmp = UINT_MAX;
14036 +               else
14037 +                       tmp = tmp*2 + tmp + tmp/2;
14038 +
14039 +               ctx->deadlock_inject_interval = tmp;
14040 +               ctx->deadlock_inject_countdown = tmp;
14041 +               ctx->contending_lock = lock;
14042 +
14043 +               ww_mutex_unlock(lock);
14044 +
14045 +               return -EDEADLK;
14046 +       }
14047 +#endif
14048 +
14049 +       return 0;
14050 +}
14051 +
14052 +#ifdef CONFIG_PREEMPT_RT_FULL
14053 +int __sched
14054 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
14055 +{
14056 +       int ret;
14057 +
14058 +       might_sleep();
14059 +
14060 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
14061 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
14062 +       if (ret)
14063 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
14064 +       else if (!ret && ww_ctx->acquired > 1)
14065 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
14066 +
14067 +       return ret;
14068 +}
14069 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
14070 +
14071 +int __sched
14072 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
14073 +{
14074 +       int ret;
14075 +
14076 +       might_sleep();
14077 +
14078 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
14079 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
14080 +       if (ret)
14081 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
14082 +       else if (!ret && ww_ctx->acquired > 1)
14083 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
14084 +
14085 +       return ret;
14086 +}
14087 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
14088 +
14089 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
14090 +{
14091 +       int nest = !!lock->ctx;
14092 +
14093 +       /*
14094 +        * The unlocking fastpath is the 0->1 transition from 'locked'
14095 +        * into 'unlocked' state:
14096 +        */
14097 +       if (nest) {
14098 +#ifdef CONFIG_DEBUG_MUTEXES
14099 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
14100 +#endif
14101 +               if (lock->ctx->acquired > 0)
14102 +                       lock->ctx->acquired--;
14103 +               lock->ctx = NULL;
14104 +       }
14105 +
14106 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
14107 +       rt_mutex_unlock(&lock->base.lock);
14108 +}
14109 +EXPORT_SYMBOL(ww_mutex_unlock);
14110 +#endif
14111 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
14112 index e317e1cbb3eb..f457c7574920 100644
14113 --- a/kernel/locking/rtmutex_common.h
14114 +++ b/kernel/locking/rtmutex_common.h
14115 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
14116         struct rb_node          pi_tree_entry;
14117         struct task_struct      *task;
14118         struct rt_mutex         *lock;
14119 +       bool                    savestate;
14120  #ifdef CONFIG_DEBUG_RT_MUTEXES
14121         unsigned long           ip;
14122         struct pid              *deadlock_task_pid;
14123 @@ -98,6 +99,9 @@ enum rtmutex_chainwalk {
14124  /*
14125   * PI-futex support (proxy locking functions, etc.):
14126   */
14127 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
14128 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
14129 +
14130  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
14131  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
14132                                        struct task_struct *proxy_owner);
14133 @@ -111,7 +115,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
14134                                       struct rt_mutex_waiter *waiter);
14135  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
14136  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
14137 -                                 struct wake_q_head *wqh);
14138 +                                 struct wake_q_head *wqh,
14139 +                                 struct wake_q_head *wq_sleeper);
14140  extern void rt_mutex_adjust_prio(struct task_struct *task);
14141  
14142  #ifdef CONFIG_DEBUG_RT_MUTEXES
14143 @@ -120,4 +125,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
14144  # include "rtmutex.h"
14145  #endif
14146  
14147 +static inline void
14148 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
14149 +{
14150 +       debug_rt_mutex_init_waiter(waiter);
14151 +       waiter->task = NULL;
14152 +       waiter->savestate = savestate;
14153 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
14154 +       RB_CLEAR_NODE(&waiter->tree_entry);
14155 +}
14156 +
14157  #endif
14158 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
14159 index db3ccb1dd614..909779647bd1 100644
14160 --- a/kernel/locking/spinlock.c
14161 +++ b/kernel/locking/spinlock.c
14162 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
14163   *         __[spin|read|write]_lock_bh()
14164   */
14165  BUILD_LOCK_OPS(spin, raw_spinlock);
14166 +
14167 +#ifndef CONFIG_PREEMPT_RT_FULL
14168  BUILD_LOCK_OPS(read, rwlock);
14169  BUILD_LOCK_OPS(write, rwlock);
14170 +#endif
14171  
14172  #endif
14173  
14174 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
14175  EXPORT_SYMBOL(_raw_spin_unlock_bh);
14176  #endif
14177  
14178 +#ifndef CONFIG_PREEMPT_RT_FULL
14179 +
14180  #ifndef CONFIG_INLINE_READ_TRYLOCK
14181  int __lockfunc _raw_read_trylock(rwlock_t *lock)
14182  {
14183 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
14184  EXPORT_SYMBOL(_raw_write_unlock_bh);
14185  #endif
14186  
14187 +#endif /* !PREEMPT_RT_FULL */
14188 +
14189  #ifdef CONFIG_DEBUG_LOCK_ALLOC
14190  
14191  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
14192 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
14193 index 0374a596cffa..94970338d518 100644
14194 --- a/kernel/locking/spinlock_debug.c
14195 +++ b/kernel/locking/spinlock_debug.c
14196 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
14197  
14198  EXPORT_SYMBOL(__raw_spin_lock_init);
14199  
14200 +#ifndef CONFIG_PREEMPT_RT_FULL
14201  void __rwlock_init(rwlock_t *lock, const char *name,
14202                    struct lock_class_key *key)
14203  {
14204 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
14205  }
14206  
14207  EXPORT_SYMBOL(__rwlock_init);
14208 +#endif
14209  
14210  static void spin_dump(raw_spinlock_t *lock, const char *msg)
14211  {
14212 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
14213         arch_spin_unlock(&lock->raw_lock);
14214  }
14215  
14216 +#ifndef CONFIG_PREEMPT_RT_FULL
14217  static void rwlock_bug(rwlock_t *lock, const char *msg)
14218  {
14219         if (!debug_locks_off())
14220 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
14221         debug_write_unlock(lock);
14222         arch_write_unlock(&lock->raw_lock);
14223  }
14224 +
14225 +#endif
14226 diff --git a/kernel/panic.c b/kernel/panic.c
14227 index e6480e20379e..7e9c1918a94e 100644
14228 --- a/kernel/panic.c
14229 +++ b/kernel/panic.c
14230 @@ -482,9 +482,11 @@ static u64 oops_id;
14231  
14232  static int init_oops_id(void)
14233  {
14234 +#ifndef CONFIG_PREEMPT_RT_FULL
14235         if (!oops_id)
14236                 get_random_bytes(&oops_id, sizeof(oops_id));
14237         else
14238 +#endif
14239                 oops_id++;
14240  
14241         return 0;
14242 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
14243 index b26dbc48c75b..968255f27a33 100644
14244 --- a/kernel/power/hibernate.c
14245 +++ b/kernel/power/hibernate.c
14246 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
14247  
14248         local_irq_disable();
14249  
14250 +       system_state = SYSTEM_SUSPEND;
14251 +
14252         error = syscore_suspend();
14253         if (error) {
14254                 printk(KERN_ERR "PM: Some system devices failed to power down, "
14255 @@ -317,6 +319,7 @@ static int create_image(int platform_mode)
14256         syscore_resume();
14257  
14258   Enable_irqs:
14259 +       system_state = SYSTEM_RUNNING;
14260         local_irq_enable();
14261  
14262   Enable_cpus:
14263 @@ -446,6 +449,7 @@ static int resume_target_kernel(bool platform_mode)
14264                 goto Enable_cpus;
14265  
14266         local_irq_disable();
14267 +       system_state = SYSTEM_SUSPEND;
14268  
14269         error = syscore_suspend();
14270         if (error)
14271 @@ -479,6 +483,7 @@ static int resume_target_kernel(bool platform_mode)
14272         syscore_resume();
14273  
14274   Enable_irqs:
14275 +       system_state = SYSTEM_RUNNING;
14276         local_irq_enable();
14277  
14278   Enable_cpus:
14279 @@ -564,6 +569,7 @@ int hibernation_platform_enter(void)
14280                 goto Enable_cpus;
14281  
14282         local_irq_disable();
14283 +       system_state = SYSTEM_SUSPEND;
14284         syscore_suspend();
14285         if (pm_wakeup_pending()) {
14286                 error = -EAGAIN;
14287 @@ -576,6 +582,7 @@ int hibernation_platform_enter(void)
14288  
14289   Power_up:
14290         syscore_resume();
14291 +       system_state = SYSTEM_RUNNING;
14292         local_irq_enable();
14293  
14294   Enable_cpus:
14295 @@ -676,6 +683,10 @@ static int load_image_and_restore(void)
14296         return error;
14297  }
14298  
14299 +#ifndef CONFIG_SUSPEND
14300 +bool pm_in_action;
14301 +#endif
14302 +
14303  /**
14304   * hibernate - Carry out system hibernation, including saving the image.
14305   */
14306 @@ -689,6 +700,8 @@ int hibernate(void)
14307                 return -EPERM;
14308         }
14309  
14310 +       pm_in_action = true;
14311 +
14312         lock_system_sleep();
14313         /* The snapshot device should not be opened while we're running */
14314         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
14315 @@ -766,6 +779,7 @@ int hibernate(void)
14316         atomic_inc(&snapshot_device_available);
14317   Unlock:
14318         unlock_system_sleep();
14319 +       pm_in_action = false;
14320         return error;
14321  }
14322  
14323 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
14324 index 6ccb08f57fcb..c8cbb5ed2fe3 100644
14325 --- a/kernel/power/suspend.c
14326 +++ b/kernel/power/suspend.c
14327 @@ -369,6 +369,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
14328         arch_suspend_disable_irqs();
14329         BUG_ON(!irqs_disabled());
14330  
14331 +       system_state = SYSTEM_SUSPEND;
14332 +
14333         error = syscore_suspend();
14334         if (!error) {
14335                 *wakeup = pm_wakeup_pending();
14336 @@ -385,6 +387,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
14337                 syscore_resume();
14338         }
14339  
14340 +       system_state = SYSTEM_RUNNING;
14341 +
14342         arch_suspend_enable_irqs();
14343         BUG_ON(irqs_disabled());
14344  
14345 @@ -527,6 +531,8 @@ static int enter_state(suspend_state_t state)
14346         return error;
14347  }
14348  
14349 +bool pm_in_action;
14350 +
14351  /**
14352   * pm_suspend - Externally visible function for suspending the system.
14353   * @state: System sleep state to enter.
14354 @@ -541,6 +547,8 @@ int pm_suspend(suspend_state_t state)
14355         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
14356                 return -EINVAL;
14357  
14358 +       pm_in_action = true;
14359 +
14360         error = enter_state(state);
14361         if (error) {
14362                 suspend_stats.fail++;
14363 @@ -548,6 +556,7 @@ int pm_suspend(suspend_state_t state)
14364         } else {
14365                 suspend_stats.success++;
14366         }
14367 +       pm_in_action = false;
14368         return error;
14369  }
14370  EXPORT_SYMBOL(pm_suspend);
14371 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
14372 index f7a55e9ff2f7..9277ee033271 100644
14373 --- a/kernel/printk/printk.c
14374 +++ b/kernel/printk/printk.c
14375 @@ -351,6 +351,65 @@ __packed __aligned(4)
14376   */
14377  DEFINE_RAW_SPINLOCK(logbuf_lock);
14378  
14379 +#ifdef CONFIG_EARLY_PRINTK
14380 +struct console *early_console;
14381 +
14382 +static void early_vprintk(const char *fmt, va_list ap)
14383 +{
14384 +       if (early_console) {
14385 +               char buf[512];
14386 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
14387 +
14388 +               early_console->write(early_console, buf, n);
14389 +       }
14390 +}
14391 +
14392 +asmlinkage void early_printk(const char *fmt, ...)
14393 +{
14394 +       va_list ap;
14395 +
14396 +       va_start(ap, fmt);
14397 +       early_vprintk(fmt, ap);
14398 +       va_end(ap);
14399 +}
14400 +
14401 +/*
14402 + * This is independent of any log levels - a global
14403 + * kill switch that turns off all of printk.
14404 + *
14405 + * Used by the NMI watchdog if early-printk is enabled.
14406 + */
14407 +static bool __read_mostly printk_killswitch;
14408 +
14409 +static int __init force_early_printk_setup(char *str)
14410 +{
14411 +       printk_killswitch = true;
14412 +       return 0;
14413 +}
14414 +early_param("force_early_printk", force_early_printk_setup);
14415 +
14416 +void printk_kill(void)
14417 +{
14418 +       printk_killswitch = true;
14419 +}
14420 +
14421 +#ifdef CONFIG_PRINTK
14422 +static int forced_early_printk(const char *fmt, va_list ap)
14423 +{
14424 +       if (!printk_killswitch)
14425 +               return 0;
14426 +       early_vprintk(fmt, ap);
14427 +       return 1;
14428 +}
14429 +#endif
14430 +
14431 +#else
14432 +static inline int forced_early_printk(const char *fmt, va_list ap)
14433 +{
14434 +       return 0;
14435 +}
14436 +#endif
14437 +
14438  #ifdef CONFIG_PRINTK
14439  DECLARE_WAIT_QUEUE_HEAD(log_wait);
14440  /* the next printk record to read by syslog(READ) or /proc/kmsg */
14441 @@ -1337,6 +1396,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14442  {
14443         char *text;
14444         int len = 0;
14445 +       int attempts = 0;
14446  
14447         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
14448         if (!text)
14449 @@ -1348,6 +1408,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14450                 u64 seq;
14451                 u32 idx;
14452                 enum log_flags prev;
14453 +               int num_msg;
14454 +try_again:
14455 +               attempts++;
14456 +               if (attempts > 10) {
14457 +                       len = -EBUSY;
14458 +                       goto out;
14459 +               }
14460 +               num_msg = 0;
14461  
14462                 /*
14463                  * Find first record that fits, including all following records,
14464 @@ -1363,6 +1431,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14465                         prev = msg->flags;
14466                         idx = log_next(idx);
14467                         seq++;
14468 +                       num_msg++;
14469 +                       if (num_msg > 5) {
14470 +                               num_msg = 0;
14471 +                               raw_spin_unlock_irq(&logbuf_lock);
14472 +                               raw_spin_lock_irq(&logbuf_lock);
14473 +                               if (clear_seq < log_first_seq)
14474 +                                       goto try_again;
14475 +                       }
14476                 }
14477  
14478                 /* move first record forward until length fits into the buffer */
14479 @@ -1376,6 +1452,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14480                         prev = msg->flags;
14481                         idx = log_next(idx);
14482                         seq++;
14483 +                       num_msg++;
14484 +                       if (num_msg > 5) {
14485 +                               num_msg = 0;
14486 +                               raw_spin_unlock_irq(&logbuf_lock);
14487 +                               raw_spin_lock_irq(&logbuf_lock);
14488 +                               if (clear_seq < log_first_seq)
14489 +                                       goto try_again;
14490 +                       }
14491                 }
14492  
14493                 /* last message fitting into this dump */
14494 @@ -1416,6 +1500,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
14495                 clear_seq = log_next_seq;
14496                 clear_idx = log_next_idx;
14497         }
14498 +out:
14499         raw_spin_unlock_irq(&logbuf_lock);
14500  
14501         kfree(text);
14502 @@ -1569,6 +1654,12 @@ static void call_console_drivers(int level,
14503         if (!console_drivers)
14504                 return;
14505  
14506 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
14507 +               if (in_irq() || in_nmi())
14508 +                       return;
14509 +       }
14510 +
14511 +       migrate_disable();
14512         for_each_console(con) {
14513                 if (exclusive_console && con != exclusive_console)
14514                         continue;
14515 @@ -1584,6 +1675,7 @@ static void call_console_drivers(int level,
14516                 else
14517                         con->write(con, text, len);
14518         }
14519 +       migrate_enable();
14520  }
14521  
14522  /*
14523 @@ -1781,6 +1873,13 @@ asmlinkage int vprintk_emit(int facility, int level,
14524         /* cpu currently holding logbuf_lock in this function */
14525         static unsigned int logbuf_cpu = UINT_MAX;
14526  
14527 +       /*
14528 +        * Fall back to early_printk if a debugging subsystem has
14529 +        * killed printk output
14530 +        */
14531 +       if (unlikely(forced_early_printk(fmt, args)))
14532 +               return 1;
14533 +
14534         if (level == LOGLEVEL_SCHED) {
14535                 level = LOGLEVEL_DEFAULT;
14536                 in_sched = true;
14537 @@ -1885,13 +1984,23 @@ asmlinkage int vprintk_emit(int facility, int level,
14538  
14539         /* If called from the scheduler, we can not call up(). */
14540         if (!in_sched) {
14541 +               int may_trylock = 1;
14542 +
14543                 lockdep_off();
14544 +#ifdef CONFIG_PREEMPT_RT_FULL
14545 +               /*
14546 +                * we can't take a sleeping lock with IRQs or preemption disabled,
14547 +                * so we can't print in these contexts
14548 +                */
14549 +               if (!(preempt_count() == 0 && !irqs_disabled()))
14550 +                       may_trylock = 0;
14551 +#endif
14552                 /*
14553                  * Try to acquire and then immediately release the console
14554                  * semaphore.  The release will print out buffers and wake up
14555                  * /dev/kmsg and syslog() users.
14556                  */
14557 -               if (console_trylock())
14558 +               if (may_trylock && console_trylock())
14559                         console_unlock();
14560                 lockdep_on();
14561         }
14562 @@ -2014,26 +2123,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
14563  
14564  #endif /* CONFIG_PRINTK */
14565  
14566 -#ifdef CONFIG_EARLY_PRINTK
14567 -struct console *early_console;
14568 -
14569 -asmlinkage __visible void early_printk(const char *fmt, ...)
14570 -{
14571 -       va_list ap;
14572 -       char buf[512];
14573 -       int n;
14574 -
14575 -       if (!early_console)
14576 -               return;
14577 -
14578 -       va_start(ap, fmt);
14579 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
14580 -       va_end(ap);
14581 -
14582 -       early_console->write(early_console, buf, n);
14583 -}
14584 -#endif
14585 -
14586  static int __add_preferred_console(char *name, int idx, char *options,
14587                                    char *brl_options)
14588  {
14589 @@ -2303,11 +2392,16 @@ static void console_cont_flush(char *text, size_t size)
14590                 goto out;
14591  
14592         len = cont_print_text(text, size);
14593 +#ifdef CONFIG_PREEMPT_RT_FULL
14594 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
14595 +       call_console_drivers(cont.level, NULL, 0, text, len);
14596 +#else
14597         raw_spin_unlock(&logbuf_lock);
14598         stop_critical_timings();
14599         call_console_drivers(cont.level, NULL, 0, text, len);
14600         start_critical_timings();
14601         local_irq_restore(flags);
14602 +#endif
14603         return;
14604  out:
14605         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
14606 @@ -2431,13 +2525,17 @@ void console_unlock(void)
14607                 console_idx = log_next(console_idx);
14608                 console_seq++;
14609                 console_prev = msg->flags;
14610 +#ifdef CONFIG_PREEMPT_RT_FULL
14611 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
14612 +               call_console_drivers(level, ext_text, ext_len, text, len);
14613 +#else
14614                 raw_spin_unlock(&logbuf_lock);
14615  
14616                 stop_critical_timings();        /* don't trace print latency */
14617                 call_console_drivers(level, ext_text, ext_len, text, len);
14618                 start_critical_timings();
14619                 local_irq_restore(flags);
14620 -
14621 +#endif
14622                 if (do_cond_resched)
14623                         cond_resched();
14624         }
14625 @@ -2489,6 +2587,11 @@ void console_unblank(void)
14626  {
14627         struct console *c;
14628  
14629 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
14630 +               if (in_irq() || in_nmi())
14631 +                       return;
14632 +       }
14633 +
14634         /*
14635          * console_unblank can no longer be called in interrupt context unless
14636          * oops_in_progress is set to 1..
14637 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
14638 index 49ba7c1ade9d..44f44b47ec07 100644
14639 --- a/kernel/ptrace.c
14640 +++ b/kernel/ptrace.c
14641 @@ -166,7 +166,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
14642  
14643         spin_lock_irq(&task->sighand->siglock);
14644         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
14645 -               task->state = __TASK_TRACED;
14646 +               unsigned long flags;
14647 +
14648 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
14649 +               if (task->state & __TASK_TRACED)
14650 +                       task->state = __TASK_TRACED;
14651 +               else
14652 +                       task->saved_state = __TASK_TRACED;
14653 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14654                 ret = true;
14655         }
14656         spin_unlock_irq(&task->sighand->siglock);
14657 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
14658 index bf08fee53dc7..eeb8ce4ad7b6 100644
14659 --- a/kernel/rcu/rcutorture.c
14660 +++ b/kernel/rcu/rcutorture.c
14661 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
14662         .name           = "rcu"
14663  };
14664  
14665 +#ifndef CONFIG_PREEMPT_RT_FULL
14666  /*
14667   * Definitions for rcu_bh torture testing.
14668   */
14669 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
14670         .name           = "rcu_bh"
14671  };
14672  
14673 +#else
14674 +static struct rcu_torture_ops rcu_bh_ops = {
14675 +       .ttype          = INVALID_RCU_FLAVOR,
14676 +};
14677 +#endif
14678 +
14679  /*
14680   * Don't even think about trying any of these in real life!!!
14681   * The names includes "busted", and they really means it!
14682 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
14683 index 10f62c6f48e7..dbee19478f09 100644
14684 --- a/kernel/rcu/tree.c
14685 +++ b/kernel/rcu/tree.c
14686 @@ -55,6 +55,11 @@
14687  #include <linux/random.h>
14688  #include <linux/trace_events.h>
14689  #include <linux/suspend.h>
14690 +#include <linux/delay.h>
14691 +#include <linux/gfp.h>
14692 +#include <linux/oom.h>
14693 +#include <linux/smpboot.h>
14694 +#include "../time/tick-internal.h"
14695  
14696  #include "tree.h"
14697  #include "rcu.h"
14698 @@ -260,6 +265,19 @@ void rcu_sched_qs(void)
14699                            this_cpu_ptr(&rcu_sched_data), true);
14700  }
14701  
14702 +#ifdef CONFIG_PREEMPT_RT_FULL
14703 +static void rcu_preempt_qs(void);
14704 +
14705 +void rcu_bh_qs(void)
14706 +{
14707 +       unsigned long flags;
14708 +
14709 +       /* rcu_preempt_qs() must be invoked with irqs disabled. */
14710 +       local_irq_save(flags);
14711 +       rcu_preempt_qs();
14712 +       local_irq_restore(flags);
14713 +}
14714 +#else
14715  void rcu_bh_qs(void)
14716  {
14717         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
14718 @@ -269,6 +287,7 @@ void rcu_bh_qs(void)
14719                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
14720         }
14721  }
14722 +#endif
14723  
14724  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
14725  
14726 @@ -449,11 +468,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
14727  /*
14728   * Return the number of RCU BH batches started thus far for debug & stats.
14729   */
14730 +#ifndef CONFIG_PREEMPT_RT_FULL
14731  unsigned long rcu_batches_started_bh(void)
14732  {
14733         return rcu_bh_state.gpnum;
14734  }
14735  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
14736 +#endif
14737  
14738  /*
14739   * Return the number of RCU batches completed thus far for debug & stats.
14740 @@ -473,6 +494,7 @@ unsigned long rcu_batches_completed_sched(void)
14741  }
14742  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
14743  
14744 +#ifndef CONFIG_PREEMPT_RT_FULL
14745  /*
14746   * Return the number of RCU BH batches completed thus far for debug & stats.
14747   */
14748 @@ -481,6 +503,7 @@ unsigned long rcu_batches_completed_bh(void)
14749         return rcu_bh_state.completed;
14750  }
14751  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
14752 +#endif
14753  
14754  /*
14755   * Return the number of RCU expedited batches completed thus far for
14756 @@ -504,6 +527,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
14757  }
14758  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
14759  
14760 +#ifndef CONFIG_PREEMPT_RT_FULL
14761  /*
14762   * Force a quiescent state.
14763   */
14764 @@ -522,6 +546,13 @@ void rcu_bh_force_quiescent_state(void)
14765  }
14766  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
14767  
14768 +#else
14769 +void rcu_force_quiescent_state(void)
14770 +{
14771 +}
14772 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
14773 +#endif
14774 +
14775  /*
14776   * Force a quiescent state for RCU-sched.
14777   */
14778 @@ -572,9 +603,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
14779         case RCU_FLAVOR:
14780                 rsp = rcu_state_p;
14781                 break;
14782 +#ifndef CONFIG_PREEMPT_RT_FULL
14783         case RCU_BH_FLAVOR:
14784                 rsp = &rcu_bh_state;
14785                 break;
14786 +#endif
14787         case RCU_SCHED_FLAVOR:
14788                 rsp = &rcu_sched_state;
14789                 break;
14790 @@ -3016,18 +3049,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
14791  /*
14792   * Do RCU core processing for the current CPU.
14793   */
14794 -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
14795 +static __latent_entropy void rcu_process_callbacks(void)
14796  {
14797         struct rcu_state *rsp;
14798  
14799         if (cpu_is_offline(smp_processor_id()))
14800                 return;
14801 -       trace_rcu_utilization(TPS("Start RCU core"));
14802         for_each_rcu_flavor(rsp)
14803                 __rcu_process_callbacks(rsp);
14804 -       trace_rcu_utilization(TPS("End RCU core"));
14805  }
14806  
14807 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
14808  /*
14809   * Schedule RCU callback invocation.  If the specified type of RCU
14810   * does not support RCU priority boosting, just do a direct call,
14811 @@ -3039,19 +3071,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
14812  {
14813         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
14814                 return;
14815 -       if (likely(!rsp->boost)) {
14816 -               rcu_do_batch(rsp, rdp);
14817 -               return;
14818 -       }
14819 -       invoke_rcu_callbacks_kthread();
14820 +       rcu_do_batch(rsp, rdp);
14821  }
14822  
14823 +static void rcu_wake_cond(struct task_struct *t, int status)
14824 +{
14825 +       /*
14826 +        * If the thread is yielding, only wake it when this
14827 +        * is invoked from idle
14828 +        */
14829 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
14830 +               wake_up_process(t);
14831 +}
14832 +
14833 +/*
14834 + * Wake up this CPU's rcuc kthread to do RCU core processing.
14835 + */
14836  static void invoke_rcu_core(void)
14837  {
14838 -       if (cpu_online(smp_processor_id()))
14839 -               raise_softirq(RCU_SOFTIRQ);
14840 +       unsigned long flags;
14841 +       struct task_struct *t;
14842 +
14843 +       if (!cpu_online(smp_processor_id()))
14844 +               return;
14845 +       local_irq_save(flags);
14846 +       __this_cpu_write(rcu_cpu_has_work, 1);
14847 +       t = __this_cpu_read(rcu_cpu_kthread_task);
14848 +       if (t != NULL && current != t)
14849 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
14850 +       local_irq_restore(flags);
14851  }
14852  
14853 +static void rcu_cpu_kthread_park(unsigned int cpu)
14854 +{
14855 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
14856 +}
14857 +
14858 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
14859 +{
14860 +       return __this_cpu_read(rcu_cpu_has_work);
14861 +}
14862 +
14863 +/*
14864 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
14865 + * RCU softirq used in flavors and configurations of RCU that do not
14866 + * support RCU priority boosting.
14867 + */
14868 +static void rcu_cpu_kthread(unsigned int cpu)
14869 +{
14870 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
14871 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
14872 +       int spincnt;
14873 +
14874 +       for (spincnt = 0; spincnt < 10; spincnt++) {
14875 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
14876 +               local_bh_disable();
14877 +               *statusp = RCU_KTHREAD_RUNNING;
14878 +               this_cpu_inc(rcu_cpu_kthread_loops);
14879 +               local_irq_disable();
14880 +               work = *workp;
14881 +               *workp = 0;
14882 +               local_irq_enable();
14883 +               if (work)
14884 +                       rcu_process_callbacks();
14885 +               local_bh_enable();
14886 +               if (*workp == 0) {
14887 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
14888 +                       *statusp = RCU_KTHREAD_WAITING;
14889 +                       return;
14890 +               }
14891 +       }
14892 +       *statusp = RCU_KTHREAD_YIELDING;
14893 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
14894 +       schedule_timeout_interruptible(2);
14895 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
14896 +       *statusp = RCU_KTHREAD_WAITING;
14897 +}
14898 +
14899 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
14900 +       .store                  = &rcu_cpu_kthread_task,
14901 +       .thread_should_run      = rcu_cpu_kthread_should_run,
14902 +       .thread_fn              = rcu_cpu_kthread,
14903 +       .thread_comm            = "rcuc/%u",
14904 +       .setup                  = rcu_cpu_kthread_setup,
14905 +       .park                   = rcu_cpu_kthread_park,
14906 +};
14907 +
14908 +/*
14909 + * Spawn per-CPU RCU core processing kthreads.
14910 + */
14911 +static int __init rcu_spawn_core_kthreads(void)
14912 +{
14913 +       int cpu;
14914 +
14915 +       for_each_possible_cpu(cpu)
14916 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
14917 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
14918 +       return 0;
14919 +}
14920 +early_initcall(rcu_spawn_core_kthreads);
14921 +
14922  /*
14923   * Handle any core-RCU processing required by a call_rcu() invocation.
14924   */
14925 @@ -3195,6 +3314,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
14926  }
14927  EXPORT_SYMBOL_GPL(call_rcu_sched);
14928  
14929 +#ifndef CONFIG_PREEMPT_RT_FULL
14930  /*
14931   * Queue an RCU callback for invocation after a quicker grace period.
14932   */
14933 @@ -3203,6 +3323,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
14934         __call_rcu(head, func, &rcu_bh_state, -1, 0);
14935  }
14936  EXPORT_SYMBOL_GPL(call_rcu_bh);
14937 +#endif
14938  
14939  /*
14940   * Queue an RCU callback for lazy invocation after a grace period.
14941 @@ -3294,6 +3415,7 @@ void synchronize_sched(void)
14942  }
14943  EXPORT_SYMBOL_GPL(synchronize_sched);
14944  
14945 +#ifndef CONFIG_PREEMPT_RT_FULL
14946  /**
14947   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
14948   *
14949 @@ -3320,6 +3442,7 @@ void synchronize_rcu_bh(void)
14950                 wait_rcu_gp(call_rcu_bh);
14951  }
14952  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
14953 +#endif
14954  
14955  /**
14956   * get_state_synchronize_rcu - Snapshot current RCU state
14957 @@ -3698,6 +3821,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
14958         mutex_unlock(&rsp->barrier_mutex);
14959  }
14960  
14961 +#ifndef CONFIG_PREEMPT_RT_FULL
14962  /**
14963   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
14964   */
14965 @@ -3706,6 +3830,7 @@ void rcu_barrier_bh(void)
14966         _rcu_barrier(&rcu_bh_state);
14967  }
14968  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
14969 +#endif
14970  
14971  /**
14972   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
14973 @@ -4227,12 +4352,13 @@ void __init rcu_init(void)
14974  
14975         rcu_bootup_announce();
14976         rcu_init_geometry();
14977 +#ifndef CONFIG_PREEMPT_RT_FULL
14978         rcu_init_one(&rcu_bh_state);
14979 +#endif
14980         rcu_init_one(&rcu_sched_state);
14981         if (dump_tree)
14982                 rcu_dump_rcu_node_tree(&rcu_sched_state);
14983         __rcu_init_preempt();
14984 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
14985  
14986         /*
14987          * We don't need protection against CPU-hotplug here because
14988 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
14989 index e99a5234d9ed..958ac107062c 100644
14990 --- a/kernel/rcu/tree.h
14991 +++ b/kernel/rcu/tree.h
14992 @@ -588,18 +588,18 @@ extern struct list_head rcu_struct_flavors;
14993   */
14994  extern struct rcu_state rcu_sched_state;
14995  
14996 +#ifndef CONFIG_PREEMPT_RT_FULL
14997  extern struct rcu_state rcu_bh_state;
14998 +#endif
14999  
15000  #ifdef CONFIG_PREEMPT_RCU
15001  extern struct rcu_state rcu_preempt_state;
15002  #endif /* #ifdef CONFIG_PREEMPT_RCU */
15003  
15004 -#ifdef CONFIG_RCU_BOOST
15005  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
15006  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
15007  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
15008  DECLARE_PER_CPU(char, rcu_cpu_has_work);
15009 -#endif /* #ifdef CONFIG_RCU_BOOST */
15010  
15011  #ifndef RCU_TREE_NONCORE
15012  
15013 @@ -619,10 +619,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
15014  static void __init __rcu_init_preempt(void);
15015  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
15016  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
15017 -static void invoke_rcu_callbacks_kthread(void);
15018  static bool rcu_is_callbacks_kthread(void);
15019 +static void rcu_cpu_kthread_setup(unsigned int cpu);
15020  #ifdef CONFIG_RCU_BOOST
15021 -static void rcu_preempt_do_callbacks(void);
15022  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
15023                                                  struct rcu_node *rnp);
15024  #endif /* #ifdef CONFIG_RCU_BOOST */
15025 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
15026 index 56583e764ebf..7c656f8e192f 100644
15027 --- a/kernel/rcu/tree_plugin.h
15028 +++ b/kernel/rcu/tree_plugin.h
15029 @@ -24,25 +24,10 @@
15030   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
15031   */
15032  
15033 -#include <linux/delay.h>
15034 -#include <linux/gfp.h>
15035 -#include <linux/oom.h>
15036 -#include <linux/smpboot.h>
15037 -#include "../time/tick-internal.h"
15038 -
15039  #ifdef CONFIG_RCU_BOOST
15040  
15041  #include "../locking/rtmutex_common.h"
15042  
15043 -/*
15044 - * Control variables for per-CPU and per-rcu_node kthreads.  These
15045 - * handle all flavors of RCU.
15046 - */
15047 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
15048 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
15049 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
15050 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
15051 -
15052  #else /* #ifdef CONFIG_RCU_BOOST */
15053  
15054  /*
15055 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
15056  
15057  #endif /* #else #ifdef CONFIG_RCU_BOOST */
15058  
15059 +/*
15060 + * Control variables for per-CPU and per-rcu_node kthreads.  These
15061 + * handle all flavors of RCU.
15062 + */
15063 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
15064 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
15065 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
15066 +
15067  #ifdef CONFIG_RCU_NOCB_CPU
15068  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
15069  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
15070 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
15071         }
15072  
15073         /* Hardware IRQ handlers cannot block, complain if they get here. */
15074 -       if (in_irq() || in_serving_softirq()) {
15075 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
15076                 lockdep_rcu_suspicious(__FILE__, __LINE__,
15077                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
15078                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
15079 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
15080                 t->rcu_read_unlock_special.b.need_qs = true;
15081  }
15082  
15083 -#ifdef CONFIG_RCU_BOOST
15084 -
15085 -static void rcu_preempt_do_callbacks(void)
15086 -{
15087 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
15088 -}
15089 -
15090 -#endif /* #ifdef CONFIG_RCU_BOOST */
15091 -
15092  /*
15093   * Queue a preemptible-RCU callback for invocation after a grace period.
15094   */
15095 @@ -829,6 +813,19 @@ void exit_rcu(void)
15096  
15097  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
15098  
15099 +/*
15100 + * If boosting, set rcuc kthreads to realtime priority.
15101 + */
15102 +static void rcu_cpu_kthread_setup(unsigned int cpu)
15103 +{
15104 +#ifdef CONFIG_RCU_BOOST
15105 +       struct sched_param sp;
15106 +
15107 +       sp.sched_priority = kthread_prio;
15108 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
15109 +#endif /* #ifdef CONFIG_RCU_BOOST */
15110 +}
15111 +
15112  #ifdef CONFIG_RCU_BOOST
15113  
15114  #include "../locking/rtmutex_common.h"
15115 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
15116  
15117  #endif /* #else #ifdef CONFIG_RCU_TRACE */
15118  
15119 -static void rcu_wake_cond(struct task_struct *t, int status)
15120 -{
15121 -       /*
15122 -        * If the thread is yielding, only wake it when this
15123 -        * is invoked from idle
15124 -        */
15125 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
15126 -               wake_up_process(t);
15127 -}
15128 -
15129  /*
15130   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
15131   * or ->boost_tasks, advancing the pointer to the next task in the
15132 @@ -1013,23 +1000,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
15133  }
15134  
15135  /*
15136 - * Wake up the per-CPU kthread to invoke RCU callbacks.
15137 - */
15138 -static void invoke_rcu_callbacks_kthread(void)
15139 -{
15140 -       unsigned long flags;
15141 -
15142 -       local_irq_save(flags);
15143 -       __this_cpu_write(rcu_cpu_has_work, 1);
15144 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
15145 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
15146 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
15147 -                             __this_cpu_read(rcu_cpu_kthread_status));
15148 -       }
15149 -       local_irq_restore(flags);
15150 -}
15151 -
15152 -/*
15153   * Is the current CPU running the RCU-callbacks kthread?
15154   * Caller must have preemption disabled.
15155   */
15156 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
15157         return 0;
15158  }
15159  
15160 -static void rcu_kthread_do_work(void)
15161 -{
15162 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
15163 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
15164 -       rcu_preempt_do_callbacks();
15165 -}
15166 -
15167 -static void rcu_cpu_kthread_setup(unsigned int cpu)
15168 -{
15169 -       struct sched_param sp;
15170 -
15171 -       sp.sched_priority = kthread_prio;
15172 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
15173 -}
15174 -
15175 -static void rcu_cpu_kthread_park(unsigned int cpu)
15176 -{
15177 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
15178 -}
15179 -
15180 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
15181 -{
15182 -       return __this_cpu_read(rcu_cpu_has_work);
15183 -}
15184 -
15185 -/*
15186 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
15187 - * RCU softirq used in flavors and configurations of RCU that do not
15188 - * support RCU priority boosting.
15189 - */
15190 -static void rcu_cpu_kthread(unsigned int cpu)
15191 -{
15192 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
15193 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
15194 -       int spincnt;
15195 -
15196 -       for (spincnt = 0; spincnt < 10; spincnt++) {
15197 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
15198 -               local_bh_disable();
15199 -               *statusp = RCU_KTHREAD_RUNNING;
15200 -               this_cpu_inc(rcu_cpu_kthread_loops);
15201 -               local_irq_disable();
15202 -               work = *workp;
15203 -               *workp = 0;
15204 -               local_irq_enable();
15205 -               if (work)
15206 -                       rcu_kthread_do_work();
15207 -               local_bh_enable();
15208 -               if (*workp == 0) {
15209 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
15210 -                       *statusp = RCU_KTHREAD_WAITING;
15211 -                       return;
15212 -               }
15213 -       }
15214 -       *statusp = RCU_KTHREAD_YIELDING;
15215 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
15216 -       schedule_timeout_interruptible(2);
15217 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
15218 -       *statusp = RCU_KTHREAD_WAITING;
15219 -}
15220 -
15221  /*
15222   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
15223   * served by the rcu_node in question.  The CPU hotplug lock is still
15224 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
15225         free_cpumask_var(cm);
15226  }
15227  
15228 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
15229 -       .store                  = &rcu_cpu_kthread_task,
15230 -       .thread_should_run      = rcu_cpu_kthread_should_run,
15231 -       .thread_fn              = rcu_cpu_kthread,
15232 -       .thread_comm            = "rcuc/%u",
15233 -       .setup                  = rcu_cpu_kthread_setup,
15234 -       .park                   = rcu_cpu_kthread_park,
15235 -};
15236 -
15237  /*
15238   * Spawn boost kthreads -- called as soon as the scheduler is running.
15239   */
15240  static void __init rcu_spawn_boost_kthreads(void)
15241  {
15242         struct rcu_node *rnp;
15243 -       int cpu;
15244 -
15245 -       for_each_possible_cpu(cpu)
15246 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
15247 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
15248         rcu_for_each_leaf_node(rcu_state_p, rnp)
15249                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
15250  }
15251 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
15252         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
15253  }
15254  
15255 -static void invoke_rcu_callbacks_kthread(void)
15256 -{
15257 -       WARN_ON_ONCE(1);
15258 -}
15259 -
15260  static bool rcu_is_callbacks_kthread(void)
15261  {
15262         return false;
15263 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
15264  
15265  #endif /* #else #ifdef CONFIG_RCU_BOOST */
15266  
15267 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
15268 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
15269  
15270  /*
15271   * Check to see if any future RCU-related work will need to be done
15272 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
15273         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
15274                ? 0 : rcu_cpu_has_callbacks(NULL);
15275  }
15276 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
15277  
15278 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
15279  /*
15280   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
15281   * after it.
15282 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
15283         return cbs_ready;
15284  }
15285  
15286 +#ifndef CONFIG_PREEMPT_RT_FULL
15287 +
15288  /*
15289   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
15290   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
15291 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
15292         *nextevt = basemono + dj * TICK_NSEC;
15293         return 0;
15294  }
15295 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
15296  
15297  /*
15298   * Prepare a CPU for idle from an RCU perspective.  The first major task
15299 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
15300 index 4f6db7e6a117..ee02e1e1b3e5 100644
15301 --- a/kernel/rcu/update.c
15302 +++ b/kernel/rcu/update.c
15303 @@ -62,7 +62,7 @@
15304  #ifndef CONFIG_TINY_RCU
15305  module_param(rcu_expedited, int, 0);
15306  module_param(rcu_normal, int, 0);
15307 -static int rcu_normal_after_boot;
15308 +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
15309  module_param(rcu_normal_after_boot, int, 0);
15310  #endif /* #ifndef CONFIG_TINY_RCU */
15311  
15312 @@ -132,8 +132,7 @@ bool rcu_gp_is_normal(void)
15313  }
15314  EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
15315  
15316 -static atomic_t rcu_expedited_nesting =
15317 -       ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
15318 +static atomic_t rcu_expedited_nesting =        ATOMIC_INIT(1);
15319  
15320  /*
15321   * Should normal grace-period primitives be expedited?  Intended for
15322 @@ -182,8 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
15323   */
15324  void rcu_end_inkernel_boot(void)
15325  {
15326 -       if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
15327 -               rcu_unexpedite_gp();
15328 +       rcu_unexpedite_gp();
15329         if (rcu_normal_after_boot)
15330                 WRITE_ONCE(rcu_normal, 1);
15331  }
15332 @@ -298,6 +296,7 @@ int rcu_read_lock_held(void)
15333  }
15334  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
15335  
15336 +#ifndef CONFIG_PREEMPT_RT_FULL
15337  /**
15338   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
15339   *
15340 @@ -324,6 +323,7 @@ int rcu_read_lock_bh_held(void)
15341         return in_softirq() || irqs_disabled();
15342  }
15343  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
15344 +#endif
15345  
15346  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
15347  
15348 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
15349 index 5e59b832ae2b..7337a7f60e3f 100644
15350 --- a/kernel/sched/Makefile
15351 +++ b/kernel/sched/Makefile
15352 @@ -17,7 +17,7 @@ endif
15353  
15354  obj-y += core.o loadavg.o clock.o cputime.o
15355  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
15356 -obj-y += wait.o swait.o completion.o idle.o
15357 +obj-y += wait.o swait.o swork.o completion.o idle.o
15358  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
15359  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
15360  obj-$(CONFIG_SCHEDSTATS) += stats.o
15361 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
15362 index 8d0f35debf35..b62cf6400fe0 100644
15363 --- a/kernel/sched/completion.c
15364 +++ b/kernel/sched/completion.c
15365 @@ -30,10 +30,10 @@ void complete(struct completion *x)
15366  {
15367         unsigned long flags;
15368  
15369 -       spin_lock_irqsave(&x->wait.lock, flags);
15370 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
15371         x->done++;
15372 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
15373 -       spin_unlock_irqrestore(&x->wait.lock, flags);
15374 +       swake_up_locked(&x->wait);
15375 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
15376  }
15377  EXPORT_SYMBOL(complete);
15378  
15379 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
15380  {
15381         unsigned long flags;
15382  
15383 -       spin_lock_irqsave(&x->wait.lock, flags);
15384 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
15385         x->done += UINT_MAX/2;
15386 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
15387 -       spin_unlock_irqrestore(&x->wait.lock, flags);
15388 +       swake_up_all_locked(&x->wait);
15389 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
15390  }
15391  EXPORT_SYMBOL(complete_all);
15392  
15393 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
15394                    long (*action)(long), long timeout, int state)
15395  {
15396         if (!x->done) {
15397 -               DECLARE_WAITQUEUE(wait, current);
15398 +               DECLARE_SWAITQUEUE(wait);
15399  
15400 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
15401 +               __prepare_to_swait(&x->wait, &wait);
15402                 do {
15403                         if (signal_pending_state(state, current)) {
15404                                 timeout = -ERESTARTSYS;
15405                                 break;
15406                         }
15407                         __set_current_state(state);
15408 -                       spin_unlock_irq(&x->wait.lock);
15409 +                       raw_spin_unlock_irq(&x->wait.lock);
15410                         timeout = action(timeout);
15411 -                       spin_lock_irq(&x->wait.lock);
15412 +                       raw_spin_lock_irq(&x->wait.lock);
15413                 } while (!x->done && timeout);
15414 -               __remove_wait_queue(&x->wait, &wait);
15415 +               __finish_swait(&x->wait, &wait);
15416                 if (!x->done)
15417                         return timeout;
15418         }
15419 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
15420  {
15421         might_sleep();
15422  
15423 -       spin_lock_irq(&x->wait.lock);
15424 +       raw_spin_lock_irq(&x->wait.lock);
15425         timeout = do_wait_for_common(x, action, timeout, state);
15426 -       spin_unlock_irq(&x->wait.lock);
15427 +       raw_spin_unlock_irq(&x->wait.lock);
15428         return timeout;
15429  }
15430  
15431 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
15432         if (!READ_ONCE(x->done))
15433                 return 0;
15434  
15435 -       spin_lock_irqsave(&x->wait.lock, flags);
15436 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
15437         if (!x->done)
15438                 ret = 0;
15439         else
15440                 x->done--;
15441 -       spin_unlock_irqrestore(&x->wait.lock, flags);
15442 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
15443         return ret;
15444  }
15445  EXPORT_SYMBOL(try_wait_for_completion);
15446 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
15447          * after it's acquired the lock.
15448          */
15449         smp_rmb();
15450 -       spin_unlock_wait(&x->wait.lock);
15451 +       raw_spin_unlock_wait(&x->wait.lock);
15452         return true;
15453  }
15454  EXPORT_SYMBOL(completion_done);
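
The completion.c hunks above swap the wait-queue spinlock_t (a sleeping lock on PREEMPT_RT) for a raw spinlock plus a simple waitqueue, so complete() stays callable from hard-irq context. The completion API itself is unchanged; the following minimal sketch (my_dev, my_irq_handler and my_dev_wait are invented names for illustration) behaves the same before and after the conversion, only the wakeup path underneath differs:

#include <linux/completion.h>
#include <linux/interrupt.h>

struct my_dev {
	struct completion done;	/* init_completion() at probe time (not shown) */
};

static irqreturn_t my_irq_handler(int irq, void *data)
{
	struct my_dev *dev = data;

	/* now raw_spin_lock_irqsave() + swake_up_locked() internally */
	complete(&dev->done);
	return IRQ_HANDLED;
}

static int my_dev_wait(struct my_dev *dev)
{
	/* sleeps on the simple waitqueue via do_wait_for_common() */
	return wait_for_completion_interruptible(&dev->done);
}

Note that complete_all() now warns when it has to wake more than two waiters (see the swake_up_all_locked() hunk in kernel/sched/swait.c further down), since a long wake list under a raw lock would itself be a latency concern.
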
15455 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
15456 index 154fd689fe02..a6aa5801b21e 100644
15457 --- a/kernel/sched/core.c
15458 +++ b/kernel/sched/core.c
15459 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
15460   * Number of tasks to iterate in a single balance run.
15461   * Limited because this is done with IRQs disabled.
15462   */
15463 +#ifndef CONFIG_PREEMPT_RT_FULL
15464  const_debug unsigned int sysctl_sched_nr_migrate = 32;
15465 +#else
15466 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
15467 +#endif
15468  
15469  /*
15470   * period over which we average the RT time consumption, measured
15471 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
15472  
15473         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
15474         rq->hrtick_timer.function = hrtick;
15475 +       rq->hrtick_timer.irqsafe = 1;
15476  }
15477  #else  /* CONFIG_SCHED_HRTICK */
15478  static inline void hrtick_clear(struct rq *rq)
15479 @@ -449,7 +454,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
15480         head->lastp = &node->next;
15481  }
15482  
15483 -void wake_up_q(struct wake_q_head *head)
15484 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
15485  {
15486         struct wake_q_node *node = head->first;
15487  
15488 @@ -466,7 +471,10 @@ void wake_up_q(struct wake_q_head *head)
15489                  * wake_up_process() implies a wmb() to pair with the queueing
15490                  * in wake_q_add() so as not to miss wakeups.
15491                  */
15492 -               wake_up_process(task);
15493 +               if (sleeper)
15494 +                       wake_up_lock_sleeper(task);
15495 +               else
15496 +                       wake_up_process(task);
15497                 put_task_struct(task);
15498         }
15499  }
15500 @@ -502,6 +510,38 @@ void resched_curr(struct rq *rq)
15501                 trace_sched_wake_idle_without_ipi(cpu);
15502  }
15503  
15504 +#ifdef CONFIG_PREEMPT_LAZY
15505 +void resched_curr_lazy(struct rq *rq)
15506 +{
15507 +       struct task_struct *curr = rq->curr;
15508 +       int cpu;
15509 +
15510 +       if (!sched_feat(PREEMPT_LAZY)) {
15511 +               resched_curr(rq);
15512 +               return;
15513 +       }
15514 +
15515 +       lockdep_assert_held(&rq->lock);
15516 +
15517 +       if (test_tsk_need_resched(curr))
15518 +               return;
15519 +
15520 +       if (test_tsk_need_resched_lazy(curr))
15521 +               return;
15522 +
15523 +       set_tsk_need_resched_lazy(curr);
15524 +
15525 +       cpu = cpu_of(rq);
15526 +       if (cpu == smp_processor_id())
15527 +               return;
15528 +
15529 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
15530 +       smp_mb();
15531 +       if (!tsk_is_polling(curr))
15532 +               smp_send_reschedule(cpu);
15533 +}
15534 +#endif
15535 +
15536  void resched_cpu(int cpu)
15537  {
15538         struct rq *rq = cpu_rq(cpu);
15539 @@ -525,11 +565,14 @@ void resched_cpu(int cpu)
15540   */
15541  int get_nohz_timer_target(void)
15542  {
15543 -       int i, cpu = smp_processor_id();
15544 +       int i, cpu;
15545         struct sched_domain *sd;
15546  
15547 +       preempt_disable_rt();
15548 +       cpu = smp_processor_id();
15549 +
15550         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
15551 -               return cpu;
15552 +               goto preempt_en_rt;
15553  
15554         rcu_read_lock();
15555         for_each_domain(cpu, sd) {
15556 @@ -548,6 +591,8 @@ int get_nohz_timer_target(void)
15557                 cpu = housekeeping_any_cpu();
15558  unlock:
15559         rcu_read_unlock();
15560 +preempt_en_rt:
15561 +       preempt_enable_rt();
15562         return cpu;
15563  }
15564  /*
15565 @@ -1100,6 +1145,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
15566  
15567         lockdep_assert_held(&p->pi_lock);
15568  
15569 +       if (__migrate_disabled(p)) {
15570 +               cpumask_copy(&p->cpus_allowed, new_mask);
15571 +               return;
15572 +       }
15573 +
15574         queued = task_on_rq_queued(p);
15575         running = task_current(rq, p);
15576  
15577 @@ -1122,6 +1172,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
15578                 set_curr_task(rq, p);
15579  }
15580  
15581 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
15582 +static DEFINE_MUTEX(sched_down_mutex);
15583 +static cpumask_t sched_down_cpumask;
15584 +
15585 +void tell_sched_cpu_down_begin(int cpu)
15586 +{
15587 +       mutex_lock(&sched_down_mutex);
15588 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
15589 +       mutex_unlock(&sched_down_mutex);
15590 +}
15591 +
15592 +void tell_sched_cpu_down_done(int cpu)
15593 +{
15594 +       mutex_lock(&sched_down_mutex);
15595 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
15596 +       mutex_unlock(&sched_down_mutex);
15597 +}
15598 +
15599 +/**
15600 + * migrate_me - try to move the current task off this cpu
15601 + *
15602 + * Used by the pin_current_cpu() code to try to get tasks
15603 + * to move off the current CPU as it is going down.
15604 + * It will only move the task if the task isn't pinned to
15605 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
15606 + * and the task is in a RUNNING state. Otherwise moving the
15607 + * task would wake it up (change its state to running) when
15608 + * the task does not expect it.
15609 + *
15610 + * Returns 1 if it succeeded in moving the current task
15611 + *         0 otherwise.
15612 + */
15613 +int migrate_me(void)
15614 +{
15615 +       struct task_struct *p = current;
15616 +       struct migration_arg arg;
15617 +       struct cpumask *cpumask;
15618 +       struct cpumask *mask;
15619 +       unsigned int dest_cpu;
15620 +       struct rq_flags rf;
15621 +       struct rq *rq;
15622 +
15623 +       /*
15624 +        * We cannot migrate tasks bound to a CPU or tasks that are
15625 +        * not running. Moving such a task would wake it up.
15626 +        */
15627 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
15628 +               return 0;
15629 +
15630 +       mutex_lock(&sched_down_mutex);
15631 +       rq = task_rq_lock(p, &rf);
15632 +
15633 +       cpumask = this_cpu_ptr(&sched_cpumasks);
15634 +       mask = &p->cpus_allowed;
15635 +
15636 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
15637 +
15638 +       if (!cpumask_weight(cpumask)) {
15639 +               /* It's only on this CPU? */
15640 +               task_rq_unlock(rq, p, &rf);
15641 +               mutex_unlock(&sched_down_mutex);
15642 +               return 0;
15643 +       }
15644 +
15645 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
15646 +
15647 +       arg.task = p;
15648 +       arg.dest_cpu = dest_cpu;
15649 +
15650 +       task_rq_unlock(rq, p, &rf);
15651 +
15652 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
15653 +       tlb_migrate_finish(p->mm);
15654 +       mutex_unlock(&sched_down_mutex);
15655 +
15656 +       return 1;
15657 +}
15658 +
15659  /*
15660   * Change a given task's CPU affinity. Migrate the thread to a
15661   * proper CPU and schedule it away if the CPU it's executing on
15662 @@ -1179,7 +1307,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
15663         }
15664  
15665         /* Can the task run on the task's current CPU? If so, we're done */
15666 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
15667 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
15668                 goto out;
15669  
15670         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
15671 @@ -1366,6 +1494,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
15672         return ret;
15673  }
15674  
15675 +static bool check_task_state(struct task_struct *p, long match_state)
15676 +{
15677 +       bool match = false;
15678 +
15679 +       raw_spin_lock_irq(&p->pi_lock);
15680 +       if (p->state == match_state || p->saved_state == match_state)
15681 +               match = true;
15682 +       raw_spin_unlock_irq(&p->pi_lock);
15683 +
15684 +       return match;
15685 +}
15686 +
15687  /*
15688   * wait_task_inactive - wait for a thread to unschedule.
15689   *
15690 @@ -1410,7 +1550,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
15691                  * is actually now running somewhere else!
15692                  */
15693                 while (task_running(rq, p)) {
15694 -                       if (match_state && unlikely(p->state != match_state))
15695 +                       if (match_state && !check_task_state(p, match_state))
15696                                 return 0;
15697                         cpu_relax();
15698                 }
15699 @@ -1425,7 +1565,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
15700                 running = task_running(rq, p);
15701                 queued = task_on_rq_queued(p);
15702                 ncsw = 0;
15703 -               if (!match_state || p->state == match_state)
15704 +               if (!match_state || p->state == match_state ||
15705 +                   p->saved_state == match_state)
15706                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
15707                 task_rq_unlock(rq, p, &rf);
15708  
15709 @@ -1680,10 +1821,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
15710  {
15711         activate_task(rq, p, en_flags);
15712         p->on_rq = TASK_ON_RQ_QUEUED;
15713 -
15714 -       /* if a worker is waking up, notify workqueue */
15715 -       if (p->flags & PF_WQ_WORKER)
15716 -               wq_worker_waking_up(p, cpu_of(rq));
15717  }
15718  
15719  /*
15720 @@ -2018,8 +2155,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
15721          */
15722         smp_mb__before_spinlock();
15723         raw_spin_lock_irqsave(&p->pi_lock, flags);
15724 -       if (!(p->state & state))
15725 +       if (!(p->state & state)) {
15726 +               /*
15727 +                * The task might be running due to a spinlock sleeper
15728 +                * wakeup. Check the saved state and set it to running
15729 +                * if the wakeup condition is true.
15730 +                */
15731 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
15732 +                       if (p->saved_state & state) {
15733 +                               p->saved_state = TASK_RUNNING;
15734 +                               success = 1;
15735 +                       }
15736 +               }
15737                 goto out;
15738 +       }
15739 +
15740 +       /*
15741 +        * If this is a regular wakeup, then we can unconditionally
15742 +        * clear the saved state of a "lock sleeper".
15743 +        */
15744 +       if (!(wake_flags & WF_LOCK_SLEEPER))
15745 +               p->saved_state = TASK_RUNNING;
15746  
15747         trace_sched_waking(p);
15748  
15749 @@ -2102,53 +2258,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
15750  }
15751  
15752  /**
15753 - * try_to_wake_up_local - try to wake up a local task with rq lock held
15754 - * @p: the thread to be awakened
15755 - * @cookie: context's cookie for pinning
15756 - *
15757 - * Put @p on the run-queue if it's not already there. The caller must
15758 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
15759 - * the current task.
15760 - */
15761 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
15762 -{
15763 -       struct rq *rq = task_rq(p);
15764 -
15765 -       if (WARN_ON_ONCE(rq != this_rq()) ||
15766 -           WARN_ON_ONCE(p == current))
15767 -               return;
15768 -
15769 -       lockdep_assert_held(&rq->lock);
15770 -
15771 -       if (!raw_spin_trylock(&p->pi_lock)) {
15772 -               /*
15773 -                * This is OK, because current is on_cpu, which avoids it being
15774 -                * picked for load-balance and preemption/IRQs are still
15775 -                * disabled avoiding further scheduler activity on it and we've
15776 -                * not yet picked a replacement task.
15777 -                */
15778 -               lockdep_unpin_lock(&rq->lock, cookie);
15779 -               raw_spin_unlock(&rq->lock);
15780 -               raw_spin_lock(&p->pi_lock);
15781 -               raw_spin_lock(&rq->lock);
15782 -               lockdep_repin_lock(&rq->lock, cookie);
15783 -       }
15784 -
15785 -       if (!(p->state & TASK_NORMAL))
15786 -               goto out;
15787 -
15788 -       trace_sched_waking(p);
15789 -
15790 -       if (!task_on_rq_queued(p))
15791 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
15792 -
15793 -       ttwu_do_wakeup(rq, p, 0, cookie);
15794 -       ttwu_stat(p, smp_processor_id(), 0);
15795 -out:
15796 -       raw_spin_unlock(&p->pi_lock);
15797 -}
15798 -
15799 -/**
15800   * wake_up_process - Wake up a specific process
15801   * @p: The process to be woken up.
15802   *
15803 @@ -2166,6 +2275,18 @@ int wake_up_process(struct task_struct *p)
15804  }
15805  EXPORT_SYMBOL(wake_up_process);
15806  
15807 +/**
15808 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
15809 + * @p: The process to be woken up.
15810 + *
15811 + * Same as wake_up_process() above, but with wake_flags=WF_LOCK_SLEEPER to
15812 + * indicate the nature of the wakeup.
15813 + */
15814 +int wake_up_lock_sleeper(struct task_struct *p)
15815 +{
15816 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
15817 +}
15818 +
15819  int wake_up_state(struct task_struct *p, unsigned int state)
15820  {
15821         return try_to_wake_up(p, state, 0);
15822 @@ -2442,6 +2563,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
15823         p->on_cpu = 0;
15824  #endif
15825         init_task_preempt_count(p);
15826 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
15827 +       task_thread_info(p)->preempt_lazy_count = 0;
15828 +#endif
15829  #ifdef CONFIG_SMP
15830         plist_node_init(&p->pushable_tasks, MAX_PRIO);
15831         RB_CLEAR_NODE(&p->pushable_dl_tasks);
15832 @@ -2770,21 +2894,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
15833         finish_arch_post_lock_switch();
15834  
15835         fire_sched_in_preempt_notifiers(current);
15836 +       /*
15837 +        * We use mmdrop_delayed() here so we don't have to do the
15838 +        * full __mmdrop() when we are the last user.
15839 +        */
15840         if (mm)
15841 -               mmdrop(mm);
15842 +               mmdrop_delayed(mm);
15843         if (unlikely(prev_state == TASK_DEAD)) {
15844                 if (prev->sched_class->task_dead)
15845                         prev->sched_class->task_dead(prev);
15846  
15847 -               /*
15848 -                * Remove function-return probe instances associated with this
15849 -                * task and put them back on the free list.
15850 -                */
15851 -               kprobe_flush_task(prev);
15852 -
15853 -               /* Task is done with its stack. */
15854 -               put_task_stack(prev);
15855 -
15856                 put_task_struct(prev);
15857         }
15858  
15859 @@ -3252,6 +3371,77 @@ static inline void schedule_debug(struct task_struct *prev)
15860         schedstat_inc(this_rq()->sched_count);
15861  }
15862  
15863 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
15864 +
15865 +void migrate_disable(void)
15866 +{
15867 +       struct task_struct *p = current;
15868 +
15869 +       if (in_atomic() || irqs_disabled()) {
15870 +#ifdef CONFIG_SCHED_DEBUG
15871 +               p->migrate_disable_atomic++;
15872 +#endif
15873 +               return;
15874 +       }
15875 +
15876 +#ifdef CONFIG_SCHED_DEBUG
15877 +       if (unlikely(p->migrate_disable_atomic)) {
15878 +               tracing_off();
15879 +               WARN_ON_ONCE(1);
15880 +       }
15881 +#endif
15882 +
15883 +       if (p->migrate_disable) {
15884 +               p->migrate_disable++;
15885 +               return;
15886 +       }
15887 +
15888 +       preempt_disable();
15889 +       preempt_lazy_disable();
15890 +       pin_current_cpu();
15891 +       p->migrate_disable = 1;
15892 +       preempt_enable();
15893 +}
15894 +EXPORT_SYMBOL(migrate_disable);
15895 +
15896 +void migrate_enable(void)
15897 +{
15898 +       struct task_struct *p = current;
15899 +
15900 +       if (in_atomic() || irqs_disabled()) {
15901 +#ifdef CONFIG_SCHED_DEBUG
15902 +               p->migrate_disable_atomic--;
15903 +#endif
15904 +               return;
15905 +       }
15906 +
15907 +#ifdef CONFIG_SCHED_DEBUG
15908 +       if (unlikely(p->migrate_disable_atomic)) {
15909 +               tracing_off();
15910 +               WARN_ON_ONCE(1);
15911 +       }
15912 +#endif
15913 +       WARN_ON_ONCE(p->migrate_disable <= 0);
15914 +
15915 +       if (p->migrate_disable > 1) {
15916 +               p->migrate_disable--;
15917 +               return;
15918 +       }
15919 +
15920 +       preempt_disable();
15921 +       /*
15922 +        * Clearing migrate_disable causes tsk_cpus_allowed to
15923 +        * show the task's original CPU affinity.
15924 +        */
15925 +       p->migrate_disable = 0;
15926 +
15927 +       unpin_current_cpu();
15928 +       preempt_enable();
15929 +       preempt_lazy_enable();
15930 +}
15931 +EXPORT_SYMBOL(migrate_enable);
15932 +#endif
15933 +
15934  /*
15935   * Pick up the highest-prio task:
15936   */
15937 @@ -3368,19 +3558,6 @@ static void __sched notrace __schedule(bool preempt)
15938                 } else {
15939                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
15940                         prev->on_rq = 0;
15941 -
15942 -                       /*
15943 -                        * If a worker went to sleep, notify and ask workqueue
15944 -                        * whether it wants to wake up a task to maintain
15945 -                        * concurrency.
15946 -                        */
15947 -                       if (prev->flags & PF_WQ_WORKER) {
15948 -                               struct task_struct *to_wakeup;
15949 -
15950 -                               to_wakeup = wq_worker_sleeping(prev);
15951 -                               if (to_wakeup)
15952 -                                       try_to_wake_up_local(to_wakeup, cookie);
15953 -                       }
15954                 }
15955                 switch_count = &prev->nvcsw;
15956         }
15957 @@ -3390,6 +3567,7 @@ static void __sched notrace __schedule(bool preempt)
15958  
15959         next = pick_next_task(rq, prev, cookie);
15960         clear_tsk_need_resched(prev);
15961 +       clear_tsk_need_resched_lazy(prev);
15962         clear_preempt_need_resched();
15963         rq->clock_skip_update = 0;
15964  
15965 @@ -3437,9 +3615,20 @@ void __noreturn do_task_dead(void)
15966  
15967  static inline void sched_submit_work(struct task_struct *tsk)
15968  {
15969 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
15970 +       if (!tsk->state)
15971                 return;
15972         /*
15973 +        * If a worker went to sleep, notify and ask workqueue whether
15974 +        * it wants to wake up a task to maintain concurrency.
15975 +        */
15976 +       if (tsk->flags & PF_WQ_WORKER)
15977 +               wq_worker_sleeping(tsk);
15978 +
15979 +
15980 +       if (tsk_is_pi_blocked(tsk))
15981 +               return;
15982 +
15983 +       /*
15984          * If we are going to sleep and we have plugged IO queued,
15985          * make sure to submit it to avoid deadlocks.
15986          */
15987 @@ -3447,6 +3636,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
15988                 blk_schedule_flush_plug(tsk);
15989  }
15990  
15991 +static void sched_update_worker(struct task_struct *tsk)
15992 +{
15993 +       if (tsk->flags & PF_WQ_WORKER)
15994 +               wq_worker_running(tsk);
15995 +}
15996 +
15997  asmlinkage __visible void __sched schedule(void)
15998  {
15999         struct task_struct *tsk = current;
16000 @@ -3457,6 +3652,7 @@ asmlinkage __visible void __sched schedule(void)
16001                 __schedule(false);
16002                 sched_preempt_enable_no_resched();
16003         } while (need_resched());
16004 +       sched_update_worker(tsk);
16005  }
16006  EXPORT_SYMBOL(schedule);
16007  
16008 @@ -3520,6 +3716,30 @@ static void __sched notrace preempt_schedule_common(void)
16009         } while (need_resched());
16010  }
16011  
16012 +#ifdef CONFIG_PREEMPT_LAZY
16013 +/*
16014 + * If TIF_NEED_RESCHED is set, we allow being scheduled away, since it is
16015 + * set by an RT task. Otherwise we try to avoid being scheduled out as long
16016 + * as the preempt_lazy_count counter is > 0.
16017 + */
16018 +static __always_inline int preemptible_lazy(void)
16019 +{
16020 +       if (test_thread_flag(TIF_NEED_RESCHED))
16021 +               return 1;
16022 +       if (current_thread_info()->preempt_lazy_count)
16023 +               return 0;
16024 +       return 1;
16025 +}
16026 +
16027 +#else
16028 +
16029 +static inline int preemptible_lazy(void)
16030 +{
16031 +       return 1;
16032 +}
16033 +
16034 +#endif
16035 +
16036  #ifdef CONFIG_PREEMPT
16037  /*
16038   * this is the entry point to schedule() from in-kernel preemption
16039 @@ -3534,7 +3754,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
16040          */
16041         if (likely(!preemptible()))
16042                 return;
16043 -
16044 +       if (!preemptible_lazy())
16045 +               return;
16046         preempt_schedule_common();
16047  }
16048  NOKPROBE_SYMBOL(preempt_schedule);
16049 @@ -3561,6 +3782,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
16050         if (likely(!preemptible()))
16051                 return;
16052  
16053 +       if (!preemptible_lazy())
16054 +               return;
16055 +
16056         do {
16057                 /*
16058                  * Because the function tracer can trace preempt_count_sub()
16059 @@ -3583,7 +3807,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
16060                  * an infinite recursion.
16061                  */
16062                 prev_ctx = exception_enter();
16063 +               /*
16064 +                * The add/subtract must not be traced by the function
16065 +                * tracer. But we still want to account for the
16066 +                * preempt off latency tracer. Since the _notrace versions
16067 +                * of add/subtract skip the accounting for the latency tracer
16068 +                * we must force it manually.
16069 +                */
16070 +               start_critical_timings();
16071                 __schedule(true);
16072 +               stop_critical_timings();
16073                 exception_exit(prev_ctx);
16074  
16075                 preempt_latency_stop(1);
16076 @@ -4939,6 +5172,7 @@ int __cond_resched_lock(spinlock_t *lock)
16077  }
16078  EXPORT_SYMBOL(__cond_resched_lock);
16079  
16080 +#ifndef CONFIG_PREEMPT_RT_FULL
16081  int __sched __cond_resched_softirq(void)
16082  {
16083         BUG_ON(!in_softirq());
16084 @@ -4952,6 +5186,7 @@ int __sched __cond_resched_softirq(void)
16085         return 0;
16086  }
16087  EXPORT_SYMBOL(__cond_resched_softirq);
16088 +#endif
16089  
16090  /**
16091   * yield - yield the current processor to other threads.
16092 @@ -5315,7 +5550,9 @@ void init_idle(struct task_struct *idle, int cpu)
16093  
16094         /* Set the preempt count _outside_ the spinlocks! */
16095         init_idle_preempt_count(idle, cpu);
16096 -
16097 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
16098 +       task_thread_info(idle)->preempt_lazy_count = 0;
16099 +#endif
16100         /*
16101          * The idle tasks have their own, simple scheduling class:
16102          */
16103 @@ -5458,6 +5695,8 @@ void sched_setnuma(struct task_struct *p, int nid)
16104  #endif /* CONFIG_NUMA_BALANCING */
16105  
16106  #ifdef CONFIG_HOTPLUG_CPU
16107 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
16108 +
16109  /*
16110   * Ensures that the idle task is using init_mm right before its cpu goes
16111   * offline.
16112 @@ -5472,7 +5711,12 @@ void idle_task_exit(void)
16113                 switch_mm_irqs_off(mm, &init_mm, current);
16114                 finish_arch_post_lock_switch();
16115         }
16116 -       mmdrop(mm);
16117 +       /*
16118 +        * Defer the cleanup to a CPU that is still alive. On RT we can
16119 +        * neither call mmdrop() nor mmdrop_delayed() from here.
16120 +        */
16121 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
16122 +
16123  }
16124  
16125  /*
16126 @@ -7418,6 +7662,10 @@ int sched_cpu_dying(unsigned int cpu)
16127         update_max_interval();
16128         nohz_balance_exit_idle(cpu);
16129         hrtick_clear(rq);
16130 +       if (per_cpu(idle_last_mm, cpu)) {
16131 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
16132 +               per_cpu(idle_last_mm, cpu) = NULL;
16133 +       }
16134         return 0;
16135  }
16136  #endif
16137 @@ -7698,7 +7946,7 @@ void __init sched_init(void)
16138  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
16139  static inline int preempt_count_equals(int preempt_offset)
16140  {
16141 -       int nested = preempt_count() + rcu_preempt_depth();
16142 +       int nested = preempt_count() + sched_rcu_preempt_depth();
16143  
16144         return (nested == preempt_offset);
16145  }
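
migrate_disable()/migrate_enable(), added above for PREEMPT_RT_FULL + SMP, pin the current task to its CPU without disabling preemption, so sleeping locks may still be taken inside the pinned section. A typical use looks like the sketch below (my_stats and my_account_event are invented names; per-CPU spin_lock_init() at boot is assumed and not shown). A lock is still needed because other tasks on the same CPU can preempt the pinned task:

#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/sched.h>	/* migrate_disable()/migrate_enable() from the RT patch */

struct my_stats {
	spinlock_t lock;		/* becomes an rt_mutex-based lock on RT */
	unsigned long events;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_account_event(void)
{
	struct my_stats *s;

	migrate_disable();		/* this_cpu_ptr() stays valid, still preemptible */
	s = this_cpu_ptr(&my_stats);
	spin_lock(&s->lock);		/* may sleep here on RT - that is allowed */
	s->events++;
	spin_unlock(&s->lock);
	migrate_enable();
}

In atomic context migrate_disable()/migrate_enable() simply return (the task cannot be migrated there anyway); the CONFIG_SCHED_DEBUG counters in the hunks above only track those calls for sanity checking.
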
16146 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
16147 index 37e2449186c4..26dcaabde8b3 100644
16148 --- a/kernel/sched/deadline.c
16149 +++ b/kernel/sched/deadline.c
16150 @@ -687,6 +687,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
16151  
16152         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16153         timer->function = dl_task_timer;
16154 +       timer->irqsafe = 1;
16155  }
16156  
16157  static
16158 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
16159 index fa178b62ea79..935224123441 100644
16160 --- a/kernel/sched/debug.c
16161 +++ b/kernel/sched/debug.c
16162 @@ -558,6 +558,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
16163         P(rt_throttled);
16164         PN(rt_time);
16165         PN(rt_runtime);
16166 +#ifdef CONFIG_SMP
16167 +       P(rt_nr_migratory);
16168 +#endif
16169  
16170  #undef PN
16171  #undef P
16172 @@ -953,6 +956,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
16173  #endif
16174         P(policy);
16175         P(prio);
16176 +#ifdef CONFIG_PREEMPT_RT_FULL
16177 +       P(migrate_disable);
16178 +#endif
16179 +       P(nr_cpus_allowed);
16180  #undef PN_SCHEDSTAT
16181  #undef PN
16182  #undef __PN
16183 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
16184 index c242944f5cbd..4aeb2e2e41bc 100644
16185 --- a/kernel/sched/fair.c
16186 +++ b/kernel/sched/fair.c
16187 @@ -3518,7 +3518,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
16188         ideal_runtime = sched_slice(cfs_rq, curr);
16189         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
16190         if (delta_exec > ideal_runtime) {
16191 -               resched_curr(rq_of(cfs_rq));
16192 +               resched_curr_lazy(rq_of(cfs_rq));
16193                 /*
16194                  * The current task ran long enough, ensure it doesn't get
16195                  * re-elected due to buddy favours.
16196 @@ -3542,7 +3542,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
16197                 return;
16198  
16199         if (delta > ideal_runtime)
16200 -               resched_curr(rq_of(cfs_rq));
16201 +               resched_curr_lazy(rq_of(cfs_rq));
16202  }
16203  
16204  static void
16205 @@ -3684,7 +3684,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
16206          * validating it and just reschedule.
16207          */
16208         if (queued) {
16209 -               resched_curr(rq_of(cfs_rq));
16210 +               resched_curr_lazy(rq_of(cfs_rq));
16211                 return;
16212         }
16213         /*
16214 @@ -3866,7 +3866,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
16215          * hierarchy can be throttled
16216          */
16217         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
16218 -               resched_curr(rq_of(cfs_rq));
16219 +               resched_curr_lazy(rq_of(cfs_rq));
16220  }
16221  
16222  static __always_inline
16223 @@ -4494,7 +4494,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
16224  
16225                 if (delta < 0) {
16226                         if (rq->curr == p)
16227 -                               resched_curr(rq);
16228 +                               resched_curr_lazy(rq);
16229                         return;
16230                 }
16231                 hrtick_start(rq, delta);
16232 @@ -5905,7 +5905,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
16233         return;
16234  
16235  preempt:
16236 -       resched_curr(rq);
16237 +       resched_curr_lazy(rq);
16238         /*
16239          * Only set the backward buddy when the current task is still
16240          * on the rq. This can happen when a wakeup gets interleaved
16241 @@ -8631,7 +8631,7 @@ static void task_fork_fair(struct task_struct *p)
16242                  * 'current' within the tree based on its new key value.
16243                  */
16244                 swap(curr->vruntime, se->vruntime);
16245 -               resched_curr(rq);
16246 +               resched_curr_lazy(rq);
16247         }
16248  
16249         se->vruntime -= cfs_rq->min_vruntime;
16250 @@ -8655,7 +8655,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
16251          */
16252         if (rq->curr == p) {
16253                 if (p->prio > oldprio)
16254 -                       resched_curr(rq);
16255 +                       resched_curr_lazy(rq);
16256         } else
16257                 check_preempt_curr(rq, p, 0);
16258  }
16259 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
16260 index 69631fa46c2f..6d28fcd08872 100644
16261 --- a/kernel/sched/features.h
16262 +++ b/kernel/sched/features.h
16263 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
16264   */
16265  SCHED_FEAT(NONTASK_CAPACITY, true)
16266  
16267 +#ifdef CONFIG_PREEMPT_RT_FULL
16268 +SCHED_FEAT(TTWU_QUEUE, false)
16269 +# ifdef CONFIG_PREEMPT_LAZY
16270 +SCHED_FEAT(PREEMPT_LAZY, true)
16271 +# endif
16272 +#else
16273 +
16274  /*
16275   * Queue remote wakeups on the target CPU and process them
16276   * using the scheduler IPI. Reduces rq->lock contention/bounces.
16277   */
16278  SCHED_FEAT(TTWU_QUEUE, true)
16279 +#endif
16280  
16281  #ifdef HAVE_RT_PUSH_IPI
16282  /*
16283 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
16284 index 2516b8df6dbb..2556baa0a97e 100644
16285 --- a/kernel/sched/rt.c
16286 +++ b/kernel/sched/rt.c
16287 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
16288  
16289         hrtimer_init(&rt_b->rt_period_timer,
16290                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16291 +       rt_b->rt_period_timer.irqsafe = 1;
16292         rt_b->rt_period_timer.function = sched_rt_period_timer;
16293  }
16294  
16295 @@ -101,6 +102,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
16296         rt_rq->push_cpu = nr_cpu_ids;
16297         raw_spin_lock_init(&rt_rq->push_lock);
16298         init_irq_work(&rt_rq->push_work, push_irq_work_func);
16299 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
16300  #endif
16301  #endif /* CONFIG_SMP */
16302         /* We start is dequeued state, because no RT tasks are queued */
16303 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
16304 index 055f935d4421..19324ac27026 100644
16305 --- a/kernel/sched/sched.h
16306 +++ b/kernel/sched/sched.h
16307 @@ -1163,6 +1163,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
16308  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
16309  #define WF_FORK                0x02            /* child wakeup after fork */
16310  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
16311 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
16312  
16313  /*
16314   * To aid in avoiding the subversion of "niceness" due to uneven distribution
16315 @@ -1346,6 +1347,15 @@ extern void init_sched_fair_class(void);
16316  extern void resched_curr(struct rq *rq);
16317  extern void resched_cpu(int cpu);
16318  
16319 +#ifdef CONFIG_PREEMPT_LAZY
16320 +extern void resched_curr_lazy(struct rq *rq);
16321 +#else
16322 +static inline void resched_curr_lazy(struct rq *rq)
16323 +{
16324 +       resched_curr(rq);
16325 +}
16326 +#endif
16327 +
16328  extern struct rt_bandwidth def_rt_bandwidth;
16329  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
16330  
16331 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
16332 index 82f0dff90030..ef027ff3250a 100644
16333 --- a/kernel/sched/swait.c
16334 +++ b/kernel/sched/swait.c
16335 @@ -1,5 +1,6 @@
16336  #include <linux/sched.h>
16337  #include <linux/swait.h>
16338 +#include <linux/suspend.h>
16339  
16340  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
16341                              struct lock_class_key *key)
16342 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
16343  }
16344  EXPORT_SYMBOL(swake_up_locked);
16345  
16346 +void swake_up_all_locked(struct swait_queue_head *q)
16347 +{
16348 +       struct swait_queue *curr;
16349 +       int wakes = 0;
16350 +
16351 +       while (!list_empty(&q->task_list)) {
16352 +
16353 +               curr = list_first_entry(&q->task_list, typeof(*curr),
16354 +                                       task_list);
16355 +               wake_up_process(curr->task);
16356 +               list_del_init(&curr->task_list);
16357 +               wakes++;
16358 +       }
16359 +       if (pm_in_action)
16360 +               return;
16361 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
16362 +}
16363 +EXPORT_SYMBOL(swake_up_all_locked);
16364 +
16365  void swake_up(struct swait_queue_head *q)
16366  {
16367         unsigned long flags;
16368 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
16369         if (!swait_active(q))
16370                 return;
16371  
16372 +       WARN_ON(irqs_disabled());
16373         raw_spin_lock_irq(&q->lock);
16374         list_splice_init(&q->task_list, &tmp);
16375         while (!list_empty(&tmp)) {
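
For reference, the simple-waitqueue API that the hunks above extend is used as in this sketch; everything except the swait_*()/swake_up() calls is an invented example:

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(my_wq);
static bool my_cond;

/* producer side, fine from hard irq context: only a raw lock is taken */
static void my_signal(void)
{
	my_cond = true;
	swake_up(&my_wq);		/* wakes at most one waiter */
}

/* consumer side, process context */
static int my_wait(void)
{
	/* sleeps until my_cond becomes true or a signal arrives */
	return swait_event_interruptible(my_wq, my_cond);
}

swake_up_all(), by contrast, is meant for process context with interrupts enabled, which is exactly what the WARN_ON(irqs_disabled()) added above enforces.
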
16376 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
16377 new file mode 100644
16378 index 000000000000..1950f40ca725
16379 --- /dev/null
16380 +++ b/kernel/sched/swork.c
16381 @@ -0,0 +1,173 @@
16382 +/*
16383 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
16384 + *
16385 + * Provides a framework for enqueuing callbacks from irq context in a
16386 + * PREEMPT_RT_FULL-safe way. The callbacks are executed in kthread context.
16387 + */
16388 +
16389 +#include <linux/swait.h>
16390 +#include <linux/swork.h>
16391 +#include <linux/kthread.h>
16392 +#include <linux/slab.h>
16393 +#include <linux/spinlock.h>
16394 +#include <linux/export.h>
16395 +
16396 +#define SWORK_EVENT_PENDING     (1 << 0)
16397 +
16398 +static DEFINE_MUTEX(worker_mutex);
16399 +static struct sworker *glob_worker;
16400 +
16401 +struct sworker {
16402 +       struct list_head events;
16403 +       struct swait_queue_head wq;
16404 +
16405 +       raw_spinlock_t lock;
16406 +
16407 +       struct task_struct *task;
16408 +       int refs;
16409 +};
16410 +
16411 +static bool swork_readable(struct sworker *worker)
16412 +{
16413 +       bool r;
16414 +
16415 +       if (kthread_should_stop())
16416 +               return true;
16417 +
16418 +       raw_spin_lock_irq(&worker->lock);
16419 +       r = !list_empty(&worker->events);
16420 +       raw_spin_unlock_irq(&worker->lock);
16421 +
16422 +       return r;
16423 +}
16424 +
16425 +static int swork_kthread(void *arg)
16426 +{
16427 +       struct sworker *worker = arg;
16428 +
16429 +       for (;;) {
16430 +               swait_event_interruptible(worker->wq,
16431 +                                       swork_readable(worker));
16432 +               if (kthread_should_stop())
16433 +                       break;
16434 +
16435 +               raw_spin_lock_irq(&worker->lock);
16436 +               while (!list_empty(&worker->events)) {
16437 +                       struct swork_event *sev;
16438 +
16439 +                       sev = list_first_entry(&worker->events,
16440 +                                       struct swork_event, item);
16441 +                       list_del(&sev->item);
16442 +                       raw_spin_unlock_irq(&worker->lock);
16443 +
16444 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
16445 +                                                        &sev->flags));
16446 +                       sev->func(sev);
16447 +                       raw_spin_lock_irq(&worker->lock);
16448 +               }
16449 +               raw_spin_unlock_irq(&worker->lock);
16450 +       }
16451 +       return 0;
16452 +}
16453 +
16454 +static struct sworker *swork_create(void)
16455 +{
16456 +       struct sworker *worker;
16457 +
16458 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
16459 +       if (!worker)
16460 +               return ERR_PTR(-ENOMEM);
16461 +
16462 +       INIT_LIST_HEAD(&worker->events);
16463 +       raw_spin_lock_init(&worker->lock);
16464 +       init_swait_queue_head(&worker->wq);
16465 +
16466 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
16467 +       if (IS_ERR(worker->task)) {
16468 +               kfree(worker);
16469 +               return ERR_PTR(-ENOMEM);
16470 +       }
16471 +
16472 +       return worker;
16473 +}
16474 +
16475 +static void swork_destroy(struct sworker *worker)
16476 +{
16477 +       kthread_stop(worker->task);
16478 +
16479 +       WARN_ON(!list_empty(&worker->events));
16480 +       kfree(worker);
16481 +}
16482 +
16483 +/**
16484 + * swork_queue - queue swork
16485 + *
16486 + * Returns %false if @work was already on a queue, %true otherwise.
16487 + *
16488 + * The work is queued and processed on a random CPU
16489 + */
16490 +bool swork_queue(struct swork_event *sev)
16491 +{
16492 +       unsigned long flags;
16493 +
16494 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
16495 +               return false;
16496 +
16497 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
16498 +       list_add_tail(&sev->item, &glob_worker->events);
16499 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
16500 +
16501 +       swake_up(&glob_worker->wq);
16502 +       return true;
16503 +}
16504 +EXPORT_SYMBOL_GPL(swork_queue);
16505 +
16506 +/**
16507 + * swork_get - get an instance of the sworker
16508 + *
16509 + * Returns a negative error code if the initialization of the worker did
16510 + * not work, %0 otherwise.
16511 + *
16512 + */
16513 +int swork_get(void)
16514 +{
16515 +       struct sworker *worker;
16516 +
16517 +       mutex_lock(&worker_mutex);
16518 +       if (!glob_worker) {
16519 +               worker = swork_create();
16520 +               if (IS_ERR(worker)) {
16521 +                       mutex_unlock(&worker_mutex);
16522 +                       return -ENOMEM;
16523 +               }
16524 +
16525 +               glob_worker = worker;
16526 +       }
16527 +
16528 +       glob_worker->refs++;
16529 +       mutex_unlock(&worker_mutex);
16530 +
16531 +       return 0;
16532 +}
16533 +EXPORT_SYMBOL_GPL(swork_get);
16534 +
16535 +/**
16536 + * swork_put - puts an instance of the sworker
16537 + *
16538 + * Will destroy the sworker thread. This function must not be called until all
16539 + * queued events have been completed.
16540 + */
16541 +void swork_put(void)
16542 +{
16543 +       mutex_lock(&worker_mutex);
16544 +
16545 +       glob_worker->refs--;
16546 +       if (glob_worker->refs > 0)
16547 +               goto out;
16548 +
16549 +       swork_destroy(glob_worker);
16550 +       glob_worker = NULL;
16551 +out:
16552 +       mutex_unlock(&worker_mutex);
16553 +}
16554 +EXPORT_SYMBOL_GPL(swork_put);
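
A sketch of how a caller of the new swork framework above would look. All names apart from the swork_*() API are invented; the companion include/linux/swork.h (not shown here) declares struct swork_event, whose item/flags/func members are initialized manually below:

#include <linux/swork.h>
#include <linux/interrupt.h>

static struct swork_event my_event;

static void my_event_fn(struct swork_event *sev)
{
	/* runs in the kswork kthread, process context, may sleep */
}

static int my_setup(void)
{
	int ret = swork_get();		/* create/refcount the global worker */

	if (ret)
		return ret;

	INIT_LIST_HEAD(&my_event.item);
	my_event.flags = 0;
	my_event.func = my_event_fn;
	return 0;
}

static irqreturn_t my_irq(int irq, void *data)
{
	swork_queue(&my_event);		/* returns false if still pending */
	return IRQ_HANDLED;
}

static void my_teardown(void)
{
	swork_put();		/* only after all queued events have completed */
}

swork_get() must run from process context since it may create the kswork thread; swork_queue() is the part that is safe from hard irq context on PREEMPT_RT_FULL.
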
16555 diff --git a/kernel/signal.c b/kernel/signal.c
16556 index 75761acc77cf..ae0773c76bb0 100644
16557 --- a/kernel/signal.c
16558 +++ b/kernel/signal.c
16559 @@ -14,6 +14,7 @@
16560  #include <linux/export.h>
16561  #include <linux/init.h>
16562  #include <linux/sched.h>
16563 +#include <linux/sched/rt.h>
16564  #include <linux/fs.h>
16565  #include <linux/tty.h>
16566  #include <linux/binfmts.h>
16567 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
16568         return false;
16569  }
16570  
16571 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
16572 +{
16573 +       struct sigqueue *q = t->sigqueue_cache;
16574 +
16575 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
16576 +               return NULL;
16577 +       return q;
16578 +}
16579 +
16580 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
16581 +{
16582 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
16583 +               return 0;
16584 +       return 1;
16585 +}
16586 +
16587  /*
16588   * allocate a new signal queue record
16589   * - this may be called without locks if and only if t == current, otherwise an
16590   *   appropriate lock must be held to stop the target task from exiting
16591   */
16592  static struct sigqueue *
16593 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
16594 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
16595 +                   int override_rlimit, int fromslab)
16596  {
16597         struct sigqueue *q = NULL;
16598         struct user_struct *user;
16599 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
16600         if (override_rlimit ||
16601             atomic_read(&user->sigpending) <=
16602                         task_rlimit(t, RLIMIT_SIGPENDING)) {
16603 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
16604 +               if (!fromslab)
16605 +                       q = get_task_cache(t);
16606 +               if (!q)
16607 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
16608         } else {
16609                 print_dropped_signal(sig);
16610         }
16611 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
16612         return q;
16613  }
16614  
16615 +static struct sigqueue *
16616 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
16617 +                int override_rlimit)
16618 +{
16619 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
16620 +}
16621 +
16622  static void __sigqueue_free(struct sigqueue *q)
16623  {
16624         if (q->flags & SIGQUEUE_PREALLOC)
16625 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
16626         kmem_cache_free(sigqueue_cachep, q);
16627  }
16628  
16629 +static void sigqueue_free_current(struct sigqueue *q)
16630 +{
16631 +       struct user_struct *up;
16632 +
16633 +       if (q->flags & SIGQUEUE_PREALLOC)
16634 +               return;
16635 +
16636 +       up = q->user;
16637 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
16638 +               atomic_dec(&up->sigpending);
16639 +               free_uid(up);
16640 +       } else
16641 +                 __sigqueue_free(q);
16642 +}
16643 +
16644  void flush_sigqueue(struct sigpending *queue)
16645  {
16646         struct sigqueue *q;
16647 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
16648  }
16649  
16650  /*
16651 + * Called from __exit_signal. Flush tsk->pending and
16652 + * tsk->sigqueue_cache
16653 + */
16654 +void flush_task_sigqueue(struct task_struct *tsk)
16655 +{
16656 +       struct sigqueue *q;
16657 +
16658 +       flush_sigqueue(&tsk->pending);
16659 +
16660 +       q = get_task_cache(tsk);
16661 +       if (q)
16662 +               kmem_cache_free(sigqueue_cachep, q);
16663 +}
16664 +
16665 +/*
16666   * Flush all pending signals for this kthread.
16667   */
16668  void flush_signals(struct task_struct *t)
16669 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
16670  still_pending:
16671                 list_del_init(&first->list);
16672                 copy_siginfo(info, &first->info);
16673 -               __sigqueue_free(first);
16674 +               sigqueue_free_current(first);
16675         } else {
16676                 /*
16677                  * Ok, it wasn't in the queue.  This must be
16678 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
16679  {
16680         int signr;
16681  
16682 +       WARN_ON_ONCE(tsk != current);
16683 +
16684         /* We only dequeue private signals from ourselves, we don't let
16685          * signalfd steal them
16686          */
16687 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
16688   * We don't want to have recursive SIGSEGV's etc, for example,
16689   * that is why we also clear SIGNAL_UNKILLABLE.
16690   */
16691 -int
16692 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16693 +static int
16694 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16695  {
16696         unsigned long int flags;
16697         int ret, blocked, ignored;
16698 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16699         return ret;
16700  }
16701  
16702 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
16703 +{
16704 +/*
16705 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
16706 + * since it can not enable preemption, and the signal code's spin_locks
16707 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
16708 + * send the signal on exit of the trap.
16709 + */
16710 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
16711 +       if (in_atomic()) {
16712 +               if (WARN_ON_ONCE(t != current))
16713 +                       return 0;
16714 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
16715 +                       return 0;
16716 +
16717 +               if (is_si_special(info)) {
16718 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
16719 +                       t->forced_info.si_signo = sig;
16720 +                       t->forced_info.si_errno = 0;
16721 +                       t->forced_info.si_code = SI_KERNEL;
16722 +                       t->forced_info.si_pid = 0;
16723 +                       t->forced_info.si_uid = 0;
16724 +               } else {
16725 +                       t->forced_info = *info;
16726 +               }
16727 +
16728 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
16729 +               return 0;
16730 +       }
16731 +#endif
16732 +       return do_force_sig_info(sig, info, t);
16733 +}
16734 +
16735  /*
16736   * Nuke all other threads in the group.
16737   */
16738 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
16739                  * Disable interrupts early to avoid deadlocks.
16740                  * See rcu_read_unlock() comment header for details.
16741                  */
16742 -               local_irq_save(*flags);
16743 +               local_irq_save_nort(*flags);
16744                 rcu_read_lock();
16745                 sighand = rcu_dereference(tsk->sighand);
16746                 if (unlikely(sighand == NULL)) {
16747                         rcu_read_unlock();
16748 -                       local_irq_restore(*flags);
16749 +                       local_irq_restore_nort(*flags);
16750                         break;
16751                 }
16752                 /*
16753 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
16754                 }
16755                 spin_unlock(&sighand->siglock);
16756                 rcu_read_unlock();
16757 -               local_irq_restore(*flags);
16758 +               local_irq_restore_nort(*flags);
16759         }
16760  
16761         return sighand;
16762 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
16763   */
16764  struct sigqueue *sigqueue_alloc(void)
16765  {
16766 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
16767 +       /* Preallocated sigqueue objects always come from the slab cache! */
16768 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
16769  
16770         if (q)
16771                 q->flags |= SIGQUEUE_PREALLOC;
16772 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
16773                 if (gstop_done && ptrace_reparented(current))
16774                         do_notify_parent_cldstop(current, false, why);
16775  
16776 -               /*
16777 -                * Don't want to allow preemption here, because
16778 -                * sys_ptrace() needs this task to be inactive.
16779 -                *
16780 -                * XXX: implement read_unlock_no_resched().
16781 -                */
16782 -               preempt_disable();
16783                 read_unlock(&tasklist_lock);
16784 -               preempt_enable_no_resched();
16785                 freezable_schedule();
16786         } else {
16787                 /*
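
The sigqueue cache added above is a one-element, lock-free cache: cmpxchg() either claims the cached entry (q -> NULL) or parks a freed one (NULL -> q), and it is only refilled on free for RT-priority tasks, so an RT task receiving signals avoids repeated kmem_cache_alloc()/free() in the delivery path. The same pattern in isolation, with an invented payload type and slot:

#include <linux/atomic.h>
#include <linux/compiler.h>

struct my_obj;

static struct my_obj *my_slot;	/* one cached object, e.g. per task */

static struct my_obj *my_slot_get(void)
{
	struct my_obj *obj = READ_ONCE(my_slot);

	/* empty, or somebody else claimed it first: fall back to the allocator */
	if (!obj || cmpxchg(&my_slot, obj, NULL) != obj)
		return NULL;
	return obj;
}

static bool my_slot_put(struct my_obj *obj)
{
	/* false means the slot was occupied and the caller must free obj itself */
	return cmpxchg(&my_slot, NULL, obj) == NULL;
}
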
16788 diff --git a/kernel/softirq.c b/kernel/softirq.c
16789 index 744fa611cae0..819bd7cf5ad0 100644
16790 --- a/kernel/softirq.c
16791 +++ b/kernel/softirq.c
16792 @@ -21,10 +21,12 @@
16793  #include <linux/freezer.h>
16794  #include <linux/kthread.h>
16795  #include <linux/rcupdate.h>
16796 +#include <linux/delay.h>
16797  #include <linux/ftrace.h>
16798  #include <linux/smp.h>
16799  #include <linux/smpboot.h>
16800  #include <linux/tick.h>
16801 +#include <linux/locallock.h>
16802  #include <linux/irq.h>
16803  
16804  #define CREATE_TRACE_POINTS
16805 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
16806  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
16807  
16808  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
16809 +#ifdef CONFIG_PREEMPT_RT_FULL
16810 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
16811 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
16812 +#endif
16813  
16814  const char * const softirq_to_name[NR_SOFTIRQS] = {
16815         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
16816         "TASKLET", "SCHED", "HRTIMER", "RCU"
16817  };
16818  
16819 +#ifdef CONFIG_NO_HZ_COMMON
16820 +# ifdef CONFIG_PREEMPT_RT_FULL
16821 +
16822 +struct softirq_runner {
16823 +       struct task_struct *runner[NR_SOFTIRQS];
16824 +};
16825 +
16826 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
16827 +
16828 +static inline void softirq_set_runner(unsigned int sirq)
16829 +{
16830 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16831 +
16832 +       sr->runner[sirq] = current;
16833 +}
16834 +
16835 +static inline void softirq_clr_runner(unsigned int sirq)
16836 +{
16837 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16838 +
16839 +       sr->runner[sirq] = NULL;
16840 +}
16841 +
16842 +/*
16843 + * On preempt-rt a softirq running context might be blocked on a
16844 + * lock. There might be no other runnable task on this CPU because the
16845 + * lock owner runs on some other CPU. So we have to go into idle with
16846 + * the pending bit set. Therefore we need to check this, otherwise we
16847 + * warn about false positives which confuses users and defeats the
16848 + * whole purpose of this test.
16849 + *
16850 + * This code is called with interrupts disabled.
16851 + */
16852 +void softirq_check_pending_idle(void)
16853 +{
16854 +       static int rate_limit;
16855 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
16856 +       u32 warnpending;
16857 +       int i;
16858 +
16859 +       if (rate_limit >= 10)
16860 +               return;
16861 +
16862 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
16863 +       for (i = 0; i < NR_SOFTIRQS; i++) {
16864 +               struct task_struct *tsk = sr->runner[i];
16865 +
16866 +               /*
16867 +                * The wakeup code in rtmutex.c wakes up the task
16868 +                * _before_ it sets pi_blocked_on to NULL under
16869 +                * tsk->pi_lock. So we need to check for both: state
16870 +                * and pi_blocked_on.
16871 +                */
16872 +               if (tsk) {
16873 +                       raw_spin_lock(&tsk->pi_lock);
16874 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
16875 +                               /* Clear all bits pending in that task */
16876 +                               warnpending &= ~(tsk->softirqs_raised);
16877 +                               warnpending &= ~(1 << i);
16878 +                       }
16879 +                       raw_spin_unlock(&tsk->pi_lock);
16880 +               }
16881 +       }
16882 +
16883 +       if (warnpending) {
16884 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16885 +                      warnpending);
16886 +               rate_limit++;
16887 +       }
16888 +}
16889 +# else
16890 +/*
16891 + * On !PREEMPT_RT we just printk rate limited:
16892 + */
16893 +void softirq_check_pending_idle(void)
16894 +{
16895 +       static int rate_limit;
16896 +
16897 +       if (rate_limit < 10 &&
16898 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
16899 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
16900 +                      local_softirq_pending());
16901 +               rate_limit++;
16902 +       }
16903 +}
16904 +# endif
16905 +
16906 +#else /* !CONFIG_NO_HZ_COMMON */
16907 +static inline void softirq_set_runner(unsigned int sirq) { }
16908 +static inline void softirq_clr_runner(unsigned int sirq) { }
16909 +#endif
16910 +
16911  /*
16912   * we cannot loop indefinitely here to avoid userspace starvation,
16913   * but we also don't want to introduce a worst case 1/HZ latency
16914 @@ -77,6 +175,38 @@ static void wakeup_softirqd(void)
16915                 wake_up_process(tsk);
16916  }
16917  
16918 +#ifdef CONFIG_PREEMPT_RT_FULL
16919 +static void wakeup_timer_softirqd(void)
16920 +{
16921 +       /* Interrupts are disabled: no need to stop preemption */
16922 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
16923 +
16924 +       if (tsk && tsk->state != TASK_RUNNING)
16925 +               wake_up_process(tsk);
16926 +}
16927 +#endif
16928 +
16929 +static void handle_softirq(unsigned int vec_nr)
16930 +{
16931 +       struct softirq_action *h = softirq_vec + vec_nr;
16932 +       int prev_count;
16933 +
16934 +       prev_count = preempt_count();
16935 +
16936 +       kstat_incr_softirqs_this_cpu(vec_nr);
16937 +
16938 +       trace_softirq_entry(vec_nr);
16939 +       h->action(h);
16940 +       trace_softirq_exit(vec_nr);
16941 +       if (unlikely(prev_count != preempt_count())) {
16942 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
16943 +                      vec_nr, softirq_to_name[vec_nr], h->action,
16944 +                      prev_count, preempt_count());
16945 +               preempt_count_set(prev_count);
16946 +       }
16947 +}
16948 +
16949 +#ifndef CONFIG_PREEMPT_RT_FULL
16950  /*
16951   * If ksoftirqd is scheduled, we do not want to process pending softirqs
16952   * right now. Let ksoftirqd handle this at its own rate, to get fairness.
16953 @@ -88,6 +218,47 @@ static bool ksoftirqd_running(void)
16954         return tsk && (tsk->state == TASK_RUNNING);
16955  }
16956  
16957 +static inline int ksoftirqd_softirq_pending(void)
16958 +{
16959 +       return local_softirq_pending();
16960 +}
16961 +
16962 +static void handle_pending_softirqs(u32 pending)
16963 +{
16964 +       struct softirq_action *h = softirq_vec;
16965 +       int softirq_bit;
16966 +
16967 +       local_irq_enable();
16968 +
16969 +       h = softirq_vec;
16970 +
16971 +       while ((softirq_bit = ffs(pending))) {
16972 +               unsigned int vec_nr;
16973 +
16974 +               h += softirq_bit - 1;
16975 +               vec_nr = h - softirq_vec;
16976 +               handle_softirq(vec_nr);
16977 +
16978 +               h++;
16979 +               pending >>= softirq_bit;
16980 +       }
16981 +
16982 +       rcu_bh_qs();
16983 +       local_irq_disable();
16984 +}
16985 +
16986 +static void run_ksoftirqd(unsigned int cpu)
16987 +{
16988 +       local_irq_disable();
16989 +       if (ksoftirqd_softirq_pending()) {
16990 +               __do_softirq();
16991 +               local_irq_enable();
16992 +               cond_resched_rcu_qs();
16993 +               return;
16994 +       }
16995 +       local_irq_enable();
16996 +}
16997 +
16998  /*
16999   * preempt_count and SOFTIRQ_OFFSET usage:
17000   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
17001 @@ -243,10 +414,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
17002         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
17003         unsigned long old_flags = current->flags;
17004         int max_restart = MAX_SOFTIRQ_RESTART;
17005 -       struct softirq_action *h;
17006         bool in_hardirq;
17007         __u32 pending;
17008 -       int softirq_bit;
17009  
17010         /*
17011          * Mask out PF_MEMALLOC s current task context is borrowed for the
17012 @@ -265,36 +434,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
17013         /* Reset the pending bitmask before enabling irqs */
17014         set_softirq_pending(0);
17015  
17016 -       local_irq_enable();
17017 -
17018 -       h = softirq_vec;
17019 -
17020 -       while ((softirq_bit = ffs(pending))) {
17021 -               unsigned int vec_nr;
17022 -               int prev_count;
17023 -
17024 -               h += softirq_bit - 1;
17025 -
17026 -               vec_nr = h - softirq_vec;
17027 -               prev_count = preempt_count();
17028 -
17029 -               kstat_incr_softirqs_this_cpu(vec_nr);
17030 -
17031 -               trace_softirq_entry(vec_nr);
17032 -               h->action(h);
17033 -               trace_softirq_exit(vec_nr);
17034 -               if (unlikely(prev_count != preempt_count())) {
17035 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
17036 -                              vec_nr, softirq_to_name[vec_nr], h->action,
17037 -                              prev_count, preempt_count());
17038 -                       preempt_count_set(prev_count);
17039 -               }
17040 -               h++;
17041 -               pending >>= softirq_bit;
17042 -       }
17043 -
17044 -       rcu_bh_qs();
17045 -       local_irq_disable();
17046 +       handle_pending_softirqs(pending);
17047  
17048         pending = local_softirq_pending();
17049         if (pending) {
17050 @@ -331,6 +471,309 @@ asmlinkage __visible void do_softirq(void)
17051  }
17052  
17053  /*
17054 + * This function must run with irqs disabled!
17055 + */
17056 +void raise_softirq_irqoff(unsigned int nr)
17057 +{
17058 +       __raise_softirq_irqoff(nr);
17059 +
17060 +       /*
17061 +        * If we're in an interrupt or softirq, we're done
17062 +        * (this also catches softirq-disabled code). We will
17063 +        * actually run the softirq once we return from
17064 +        * the irq or softirq.
17065 +        *
17066 +        * Otherwise we wake up ksoftirqd to make sure we
17067 +        * schedule the softirq soon.
17068 +        */
17069 +       if (!in_interrupt())
17070 +               wakeup_softirqd();
17071 +}
17072 +
17073 +void __raise_softirq_irqoff(unsigned int nr)
17074 +{
17075 +       trace_softirq_raise(nr);
17076 +       or_softirq_pending(1UL << nr);
17077 +}
17078 +
17079 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
17080 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
17081 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
17082 +
17083 +#else /* !PREEMPT_RT_FULL */
17084 +
17085 +/*
17086 + * On RT we serialize softirq execution with a cpu local lock per softirq
17087 + */
17088 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
17089 +
17090 +void __init softirq_early_init(void)
17091 +{
17092 +       int i;
17093 +
17094 +       for (i = 0; i < NR_SOFTIRQS; i++)
17095 +               local_irq_lock_init(local_softirq_locks[i]);
17096 +}
17097 +
17098 +static void lock_softirq(int which)
17099 +{
17100 +       local_lock(local_softirq_locks[which]);
17101 +}
17102 +
17103 +static void unlock_softirq(int which)
17104 +{
17105 +       local_unlock(local_softirq_locks[which]);
17106 +}
17107 +
17108 +static void do_single_softirq(int which)
17109 +{
17110 +       unsigned long old_flags = current->flags;
17111 +
17112 +       current->flags &= ~PF_MEMALLOC;
17113 +       vtime_account_irq_enter(current);
17114 +       current->flags |= PF_IN_SOFTIRQ;
17115 +       lockdep_softirq_enter();
17116 +       local_irq_enable();
17117 +       handle_softirq(which);
17118 +       local_irq_disable();
17119 +       lockdep_softirq_exit();
17120 +       current->flags &= ~PF_IN_SOFTIRQ;
17121 +       vtime_account_irq_enter(current);
17122 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
17123 +}
17124 +
17125 +/*
17126 + * Called with interrupts disabled. Process softirqs which were raised
17127 + * in current context (or on behalf of ksoftirqd).
17128 + */
17129 +static void do_current_softirqs(void)
17130 +{
17131 +       while (current->softirqs_raised) {
17132 +               int i = __ffs(current->softirqs_raised);
17133 +               unsigned int pending, mask = (1U << i);
17134 +
17135 +               current->softirqs_raised &= ~mask;
17136 +               local_irq_enable();
17137 +
17138 +               /*
17139 +                * If the lock is contended, we boost the owner to
17140 +                * process the softirq or leave the critical section
17141 +                * now.
17142 +                */
17143 +               lock_softirq(i);
17144 +               local_irq_disable();
17145 +               softirq_set_runner(i);
17146 +               /*
17147 +                * Check the local_softirq_pending() bits to see whether
17148 +                * we still need to process this or if someone
17149 +                * else took care of it.
17150 +                */
17151 +               pending = local_softirq_pending();
17152 +               if (pending & mask) {
17153 +                       set_softirq_pending(pending & ~mask);
17154 +                       do_single_softirq(i);
17155 +               }
17156 +               softirq_clr_runner(i);
17157 +               WARN_ON(current->softirq_nestcnt != 1);
17158 +               local_irq_enable();
17159 +               unlock_softirq(i);
17160 +               local_irq_disable();
17161 +       }
17162 +}
17163 +
17164 +void __local_bh_disable(void)
17165 +{
17166 +       if (++current->softirq_nestcnt == 1)
17167 +               migrate_disable();
17168 +}
17169 +EXPORT_SYMBOL(__local_bh_disable);
17170 +
17171 +void __local_bh_enable(void)
17172 +{
17173 +       if (WARN_ON(current->softirq_nestcnt == 0))
17174 +               return;
17175 +
17176 +       local_irq_disable();
17177 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
17178 +               do_current_softirqs();
17179 +       local_irq_enable();
17180 +
17181 +       if (--current->softirq_nestcnt == 0)
17182 +               migrate_enable();
17183 +}
17184 +EXPORT_SYMBOL(__local_bh_enable);
17185 +
17186 +void _local_bh_enable(void)
17187 +{
17188 +       if (WARN_ON(current->softirq_nestcnt == 0))
17189 +               return;
17190 +       if (--current->softirq_nestcnt == 0)
17191 +               migrate_enable();
17192 +}
17193 +EXPORT_SYMBOL(_local_bh_enable);
17194 +
17195 +int in_serving_softirq(void)
17196 +{
17197 +       return current->flags & PF_IN_SOFTIRQ;
17198 +}
17199 +EXPORT_SYMBOL(in_serving_softirq);
17200 +
17201 +/* Called with preemption disabled */
17202 +static void run_ksoftirqd(unsigned int cpu)
17203 +{
17204 +       local_irq_disable();
17205 +       current->softirq_nestcnt++;
17206 +
17207 +       do_current_softirqs();
17208 +       current->softirq_nestcnt--;
17209 +       local_irq_enable();
17210 +       cond_resched_rcu_qs();
17211 +}
17212 +
17213 +/*
17214 + * Called from netif_rx_ni(). Preemption enabled, but migration
17215 + * disabled. So the cpu can't go away under us.
17216 + */
17217 +void thread_do_softirq(void)
17218 +{
17219 +       if (!in_serving_softirq() && current->softirqs_raised) {
17220 +               current->softirq_nestcnt++;
17221 +               do_current_softirqs();
17222 +               current->softirq_nestcnt--;
17223 +       }
17224 +}
17225 +
17226 +static void do_raise_softirq_irqoff(unsigned int nr)
17227 +{
17228 +       unsigned int mask;
17229 +
17230 +       mask = 1UL << nr;
17231 +
17232 +       trace_softirq_raise(nr);
17233 +       or_softirq_pending(mask);
17234 +
17235 +       /*
17236 +        * If we are not in a hard interrupt and inside a bh disabled
17237 +        * region, we simply raise the flag on current. local_bh_enable()
17238 +        * will make sure that the softirq is executed. Otherwise we
17239 +        * delegate it to ksoftirqd.
17240 +        */
17241 +       if (!in_irq() && current->softirq_nestcnt)
17242 +               current->softirqs_raised |= mask;
17243 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
17244 +               return;
17245 +
17246 +       if (mask & TIMER_SOFTIRQS)
17247 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
17248 +       else
17249 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
17250 +}
17251 +
17252 +static void wakeup_proper_softirq(unsigned int nr)
17253 +{
17254 +       if ((1UL << nr) & TIMER_SOFTIRQS)
17255 +               wakeup_timer_softirqd();
17256 +       else
17257 +               wakeup_softirqd();
17258 +}
17259 +
17260 +void __raise_softirq_irqoff(unsigned int nr)
17261 +{
17262 +       do_raise_softirq_irqoff(nr);
17263 +       if (!in_irq() && !current->softirq_nestcnt)
17264 +               wakeup_proper_softirq(nr);
17265 +}
17266 +
17267 +/*
17268 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
17269 + */
17270 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
17271 +{
17272 +       unsigned int mask;
17273 +
17274 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
17275 +                        !__this_cpu_read(ktimer_softirqd)))
17276 +               return;
17277 +       mask = 1UL << nr;
17278 +
17279 +       trace_softirq_raise(nr);
17280 +       or_softirq_pending(mask);
17281 +       if (mask & TIMER_SOFTIRQS)
17282 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
17283 +       else
17284 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
17285 +       wakeup_proper_softirq(nr);
17286 +}
17287 +
17288 +/*
17289 + * This function must run with irqs disabled!
17290 + */
17291 +void raise_softirq_irqoff(unsigned int nr)
17292 +{
17293 +       do_raise_softirq_irqoff(nr);
17294 +
17295 +       /*
17296 +        * If we're in a hard interrupt we let the irq return code deal
17297 +        * with the wakeup of ksoftirqd.
17298 +        */
17299 +       if (in_irq())
17300 +               return;
17301 +       /*
17302 +        * If we are in thread context but outside of a bh disabled
17303 +        * region, we need to wake ksoftirqd as well.
17304 +        *
17305 +        * CHECKME: Some of the places which do that could be wrapped
17306 +        * into local_bh_disable/enable pairs. Though it's unclear
17307 +        * whether this is worth the effort. To find those places just
17308 +        * raise a WARN() if the condition is met.
17309 +        */
17310 +       if (!current->softirq_nestcnt)
17311 +               wakeup_proper_softirq(nr);
17312 +}
17313 +
17314 +static inline int ksoftirqd_softirq_pending(void)
17315 +{
17316 +       return current->softirqs_raised;
17317 +}
17318 +
17319 +static inline void local_bh_disable_nort(void) { }
17320 +static inline void _local_bh_enable_nort(void) { }
17321 +
17322 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
17323 +{
17324 +       /* Take over all but timer pending softirqs when starting */
17325 +       local_irq_disable();
17326 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
17327 +       local_irq_enable();
17328 +}
17329 +
17330 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
17331 +{
17332 +       struct sched_param param = { .sched_priority = 1 };
17333 +
17334 +       sched_setscheduler(current, SCHED_FIFO, &param);
17335 +
17336 +       /* Take over timer pending softirqs when starting */
17337 +       local_irq_disable();
17338 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
17339 +       local_irq_enable();
17340 +}
17341 +
17342 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
17343 +                                                   bool online)
17344 +{
17345 +       struct sched_param param = { .sched_priority = 0 };
17346 +
17347 +       sched_setscheduler(current, SCHED_NORMAL, &param);
17348 +}
17349 +
17350 +static int ktimer_softirqd_should_run(unsigned int cpu)
17351 +{
17352 +       return current->softirqs_raised;
17353 +}
17354 +
17355 +#endif /* PREEMPT_RT_FULL */
17356 +/*
17357   * Enter an interrupt context.
17358   */
17359  void irq_enter(void)
17360 @@ -341,9 +784,9 @@ void irq_enter(void)
17361                  * Prevent raise_softirq from needlessly waking up ksoftirqd
17362                  * here, as softirq will be serviced on return from interrupt.
17363                  */
17364 -               local_bh_disable();
17365 +               local_bh_disable_nort();
17366                 tick_irq_enter();
17367 -               _local_bh_enable();
17368 +               _local_bh_enable_nort();
17369         }
17370  
17371         __irq_enter();
17372 @@ -351,6 +794,7 @@ void irq_enter(void)
17373  
17374  static inline void invoke_softirq(void)
17375  {
17376 +#ifndef CONFIG_PREEMPT_RT_FULL
17377         if (ksoftirqd_running())
17378                 return;
17379  
17380 @@ -373,6 +817,18 @@ static inline void invoke_softirq(void)
17381         } else {
17382                 wakeup_softirqd();
17383         }
17384 +#else /* PREEMPT_RT_FULL */
17385 +       unsigned long flags;
17386 +
17387 +       local_irq_save(flags);
17388 +       if (__this_cpu_read(ksoftirqd) &&
17389 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
17390 +               wakeup_softirqd();
17391 +       if (__this_cpu_read(ktimer_softirqd) &&
17392 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
17393 +               wakeup_timer_softirqd();
17394 +       local_irq_restore(flags);
17395 +#endif
17396  }
17397  
17398  static inline void tick_irq_exit(void)
17399 @@ -409,26 +865,6 @@ void irq_exit(void)
17400         trace_hardirq_exit(); /* must be last! */
17401  }
17402  
17403 -/*
17404 - * This function must run with irqs disabled!
17405 - */
17406 -inline void raise_softirq_irqoff(unsigned int nr)
17407 -{
17408 -       __raise_softirq_irqoff(nr);
17409 -
17410 -       /*
17411 -        * If we're in an interrupt or softirq, we're done
17412 -        * (this also catches softirq-disabled code). We will
17413 -        * actually run the softirq once we return from
17414 -        * the irq or softirq.
17415 -        *
17416 -        * Otherwise we wake up ksoftirqd to make sure we
17417 -        * schedule the softirq soon.
17418 -        */
17419 -       if (!in_interrupt())
17420 -               wakeup_softirqd();
17421 -}
17422 -
17423  void raise_softirq(unsigned int nr)
17424  {
17425         unsigned long flags;
17426 @@ -438,12 +874,6 @@ void raise_softirq(unsigned int nr)
17427         local_irq_restore(flags);
17428  }
17429  
17430 -void __raise_softirq_irqoff(unsigned int nr)
17431 -{
17432 -       trace_softirq_raise(nr);
17433 -       or_softirq_pending(1UL << nr);
17434 -}
17435 -
17436  void open_softirq(int nr, void (*action)(struct softirq_action *))
17437  {
17438         softirq_vec[nr].action = action;
17439 @@ -460,15 +890,45 @@ struct tasklet_head {
17440  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
17441  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
17442  
17443 +static inline void
17444 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
17445 +{
17446 +       if (tasklet_trylock(t)) {
17447 +again:
17448 +               /* We may have been preempted before tasklet_trylock
17449 +                * and __tasklet_action may have already run.
17450 +                * So double check the sched bit while the tasklet
17451 +                * is locked before adding it to the list.
17452 +                */
17453 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
17454 +                       t->next = NULL;
17455 +                       *head->tail = t;
17456 +                       head->tail = &(t->next);
17457 +                       raise_softirq_irqoff(nr);
17458 +                       tasklet_unlock(t);
17459 +               } else {
17460 +                       /* This is subtle. If we hit the corner case above,
17461 +                        * it is possible that we get preempted right here,
17462 +                        * and another task has successfully called
17463 +                        * tasklet_schedule(), then this function, and
17464 +                        * failed on the trylock. Thus we must be sure
17465 +                        * before releasing the tasklet lock, that the
17466 +                        * SCHED_BIT is clear. Otherwise the tasklet
17467 +                        * may get its SCHED_BIT set, but not added to the
17468 +                        * list.
17469 +                        */
17470 +                       if (!tasklet_tryunlock(t))
17471 +                               goto again;
17472 +               }
17473 +       }
17474 +}
17475 +
17476  void __tasklet_schedule(struct tasklet_struct *t)
17477  {
17478         unsigned long flags;
17479  
17480         local_irq_save(flags);
17481 -       t->next = NULL;
17482 -       *__this_cpu_read(tasklet_vec.tail) = t;
17483 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
17484 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
17485 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
17486         local_irq_restore(flags);
17487  }
17488  EXPORT_SYMBOL(__tasklet_schedule);
17489 @@ -478,10 +938,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
17490         unsigned long flags;
17491  
17492         local_irq_save(flags);
17493 -       t->next = NULL;
17494 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
17495 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
17496 -       raise_softirq_irqoff(HI_SOFTIRQ);
17497 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
17498         local_irq_restore(flags);
17499  }
17500  EXPORT_SYMBOL(__tasklet_hi_schedule);
17501 @@ -490,82 +947,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
17502  {
17503         BUG_ON(!irqs_disabled());
17504  
17505 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
17506 -       __this_cpu_write(tasklet_hi_vec.head, t);
17507 -       __raise_softirq_irqoff(HI_SOFTIRQ);
17508 +       __tasklet_hi_schedule(t);
17509  }
17510  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
17511  
17512 -static __latent_entropy void tasklet_action(struct softirq_action *a)
17513 +void  tasklet_enable(struct tasklet_struct *t)
17514  {
17515 -       struct tasklet_struct *list;
17516 +       if (!atomic_dec_and_test(&t->count))
17517 +               return;
17518 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
17519 +               tasklet_schedule(t);
17520 +}
17521 +EXPORT_SYMBOL(tasklet_enable);
17522  
17523 -       local_irq_disable();
17524 -       list = __this_cpu_read(tasklet_vec.head);
17525 -       __this_cpu_write(tasklet_vec.head, NULL);
17526 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
17527 -       local_irq_enable();
17528 +static void __tasklet_action(struct softirq_action *a,
17529 +                            struct tasklet_struct *list)
17530 +{
17531 +       int loops = 1000000;
17532  
17533         while (list) {
17534                 struct tasklet_struct *t = list;
17535  
17536                 list = list->next;
17537  
17538 -               if (tasklet_trylock(t)) {
17539 -                       if (!atomic_read(&t->count)) {
17540 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
17541 -                                                       &t->state))
17542 -                                       BUG();
17543 -                               t->func(t->data);
17544 -                               tasklet_unlock(t);
17545 -                               continue;
17546 -                       }
17547 -                       tasklet_unlock(t);
17548 +               /*
17549 +                * Should always succeed - after a tasklet got on the
17550 +                * list (after getting the SCHED bit set from 0 to 1),
17551 +                * nothing but the tasklet softirq it got queued to can
17552 +                * lock it:
17553 +                */
17554 +               if (!tasklet_trylock(t)) {
17555 +                       WARN_ON(1);
17556 +                       continue;
17557                 }
17558  
17559 -               local_irq_disable();
17560                 t->next = NULL;
17561 -               *__this_cpu_read(tasklet_vec.tail) = t;
17562 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
17563 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
17564 -               local_irq_enable();
17565 +
17566 +               /*
17567 +                * If we cannot handle the tasklet because it's disabled,
17568 +                * mark it as pending. tasklet_enable() will later
17569 +                * re-schedule the tasklet.
17570 +                */
17571 +               if (unlikely(atomic_read(&t->count))) {
17572 +out_disabled:
17573 +                       /* implicit unlock: */
17574 +                       wmb();
17575 +                       t->state = TASKLET_STATEF_PENDING;
17576 +                       continue;
17577 +               }
17578 +
17579 +               /*
17580 +                * After this point on the tasklet might be rescheduled
17581 +                * on another CPU, but it can only be added to another
17582 +                * CPU's tasklet list if we unlock the tasklet (which we
17583 +                * don't do yet).
17584 +                */
17585 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
17586 +                       WARN_ON(1);
17587 +
17588 +again:
17589 +               t->func(t->data);
17590 +
17591 +               /*
17592 +                * Try to unlock the tasklet. We must use cmpxchg, because
17593 +                * another CPU might have scheduled or disabled the tasklet.
17594 +                * We only allow the STATE_RUN -> 0 transition here.
17595 +                */
17596 +               while (!tasklet_tryunlock(t)) {
17597 +                       /*
17598 +                        * If it got disabled meanwhile, bail out:
17599 +                        */
17600 +                       if (atomic_read(&t->count))
17601 +                               goto out_disabled;
17602 +                       /*
17603 +                        * If it got scheduled meanwhile, re-execute
17604 +                        * the tasklet function:
17605 +                        */
17606 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
17607 +                               goto again;
17608 +                       if (!--loops) {
17609 +                               printk("hm, tasklet state: %08lx\n", t->state);
17610 +                               WARN_ON(1);
17611 +                               tasklet_unlock(t);
17612 +                               break;
17613 +                       }
17614 +               }
17615         }
17616  }
17617  
17618 +static void tasklet_action(struct softirq_action *a)
17619 +{
17620 +       struct tasklet_struct *list;
17621 +
17622 +       local_irq_disable();
17623 +
17624 +       list = __this_cpu_read(tasklet_vec.head);
17625 +       __this_cpu_write(tasklet_vec.head, NULL);
17626 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
17627 +
17628 +       local_irq_enable();
17629 +
17630 +       __tasklet_action(a, list);
17631 +}
17632 +
17633  static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
17634  {
17635         struct tasklet_struct *list;
17636  
17637         local_irq_disable();
17638 +
17639         list = __this_cpu_read(tasklet_hi_vec.head);
17640         __this_cpu_write(tasklet_hi_vec.head, NULL);
17641         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
17642 +
17643         local_irq_enable();
17644  
17645 -       while (list) {
17646 -               struct tasklet_struct *t = list;
17647 -
17648 -               list = list->next;
17649 -
17650 -               if (tasklet_trylock(t)) {
17651 -                       if (!atomic_read(&t->count)) {
17652 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
17653 -                                                       &t->state))
17654 -                                       BUG();
17655 -                               t->func(t->data);
17656 -                               tasklet_unlock(t);
17657 -                               continue;
17658 -                       }
17659 -                       tasklet_unlock(t);
17660 -               }
17661 -
17662 -               local_irq_disable();
17663 -               t->next = NULL;
17664 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
17665 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
17666 -               __raise_softirq_irqoff(HI_SOFTIRQ);
17667 -               local_irq_enable();
17668 -       }
17669 +       __tasklet_action(a, list);
17670  }
17671  
17672  void tasklet_init(struct tasklet_struct *t,
17673 @@ -586,7 +1083,7 @@ void tasklet_kill(struct tasklet_struct *t)
17674  
17675         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
17676                 do {
17677 -                       yield();
17678 +                       msleep(1);
17679                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
17680         }
17681         tasklet_unlock_wait(t);
17682 @@ -660,25 +1157,26 @@ void __init softirq_init(void)
17683         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
17684  }
17685  
17686 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
17687 +void tasklet_unlock_wait(struct tasklet_struct *t)
17688 +{
17689 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
17690 +               /*
17691 +                * Hack for now to avoid this busy-loop:
17692 +                */
17693 +#ifdef CONFIG_PREEMPT_RT_FULL
17694 +               msleep(1);
17695 +#else
17696 +               barrier();
17697 +#endif
17698 +       }
17699 +}
17700 +EXPORT_SYMBOL(tasklet_unlock_wait);
17701 +#endif
17702 +
17703  static int ksoftirqd_should_run(unsigned int cpu)
17704  {
17705 -       return local_softirq_pending();
17706 -}
17707 -
17708 -static void run_ksoftirqd(unsigned int cpu)
17709 -{
17710 -       local_irq_disable();
17711 -       if (local_softirq_pending()) {
17712 -               /*
17713 -                * We can safely run softirq on inline stack, as we are not deep
17714 -                * in the task stack here.
17715 -                */
17716 -               __do_softirq();
17717 -               local_irq_enable();
17718 -               cond_resched_rcu_qs();
17719 -               return;
17720 -       }
17721 -       local_irq_enable();
17722 +       return ksoftirqd_softirq_pending();
17723  }
17724  
17725  #ifdef CONFIG_HOTPLUG_CPU
17726 @@ -745,17 +1243,31 @@ static int takeover_tasklets(unsigned int cpu)
17727  
17728  static struct smp_hotplug_thread softirq_threads = {
17729         .store                  = &ksoftirqd,
17730 +       .setup                  = ksoftirqd_set_sched_params,
17731         .thread_should_run      = ksoftirqd_should_run,
17732         .thread_fn              = run_ksoftirqd,
17733         .thread_comm            = "ksoftirqd/%u",
17734  };
17735  
17736 +#ifdef CONFIG_PREEMPT_RT_FULL
17737 +static struct smp_hotplug_thread softirq_timer_threads = {
17738 +       .store                  = &ktimer_softirqd,
17739 +       .setup                  = ktimer_softirqd_set_sched_params,
17740 +       .cleanup                = ktimer_softirqd_clr_sched_params,
17741 +       .thread_should_run      = ktimer_softirqd_should_run,
17742 +       .thread_fn              = run_ksoftirqd,
17743 +       .thread_comm            = "ktimersoftd/%u",
17744 +};
17745 +#endif
17746 +
17747  static __init int spawn_ksoftirqd(void)
17748  {
17749         cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
17750                                   takeover_tasklets);
17751         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
17752 -
17753 +#ifdef CONFIG_PREEMPT_RT_FULL
17754 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
17755 +#endif
17756         return 0;
17757  }
17758  early_initcall(spawn_ksoftirqd);
17759 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
17760 index ec9ab2f01489..8b89dbedeaff 100644
17761 --- a/kernel/stop_machine.c
17762 +++ b/kernel/stop_machine.c
17763 @@ -36,7 +36,7 @@ struct cpu_stop_done {
17764  struct cpu_stopper {
17765         struct task_struct      *thread;
17766  
17767 -       spinlock_t              lock;
17768 +       raw_spinlock_t          lock;
17769         bool                    enabled;        /* is this stopper enabled? */
17770         struct list_head        works;          /* list of pending works */
17771  
17772 @@ -78,14 +78,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
17773         unsigned long flags;
17774         bool enabled;
17775  
17776 -       spin_lock_irqsave(&stopper->lock, flags);
17777 +       raw_spin_lock_irqsave(&stopper->lock, flags);
17778         enabled = stopper->enabled;
17779         if (enabled)
17780                 __cpu_stop_queue_work(stopper, work);
17781         else if (work->done)
17782                 cpu_stop_signal_done(work->done);
17783 -       spin_unlock_irqrestore(&stopper->lock, flags);
17784  
17785 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
17786         return enabled;
17787  }
17788  
17789 @@ -231,8 +231,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
17790         struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
17791         int err;
17792  retry:
17793 -       spin_lock_irq(&stopper1->lock);
17794 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17795 +       raw_spin_lock_irq(&stopper1->lock);
17796 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
17797  
17798         err = -ENOENT;
17799         if (!stopper1->enabled || !stopper2->enabled)
17800 @@ -255,8 +255,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
17801         __cpu_stop_queue_work(stopper1, work1);
17802         __cpu_stop_queue_work(stopper2, work2);
17803  unlock:
17804 -       spin_unlock(&stopper2->lock);
17805 -       spin_unlock_irq(&stopper1->lock);
17806 +       raw_spin_unlock(&stopper2->lock);
17807 +       raw_spin_unlock_irq(&stopper1->lock);
17808  
17809         if (unlikely(err == -EDEADLK)) {
17810                 while (stop_cpus_in_progress)
17811 @@ -448,9 +448,9 @@ static int cpu_stop_should_run(unsigned int cpu)
17812         unsigned long flags;
17813         int run;
17814  
17815 -       spin_lock_irqsave(&stopper->lock, flags);
17816 +       raw_spin_lock_irqsave(&stopper->lock, flags);
17817         run = !list_empty(&stopper->works);
17818 -       spin_unlock_irqrestore(&stopper->lock, flags);
17819 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
17820         return run;
17821  }
17822  
17823 @@ -461,13 +461,13 @@ static void cpu_stopper_thread(unsigned int cpu)
17824  
17825  repeat:
17826         work = NULL;
17827 -       spin_lock_irq(&stopper->lock);
17828 +       raw_spin_lock_irq(&stopper->lock);
17829         if (!list_empty(&stopper->works)) {
17830                 work = list_first_entry(&stopper->works,
17831                                         struct cpu_stop_work, list);
17832                 list_del_init(&work->list);
17833         }
17834 -       spin_unlock_irq(&stopper->lock);
17835 +       raw_spin_unlock_irq(&stopper->lock);
17836  
17837         if (work) {
17838                 cpu_stop_fn_t fn = work->fn;
17839 @@ -475,6 +475,8 @@ static void cpu_stopper_thread(unsigned int cpu)
17840                 struct cpu_stop_done *done = work->done;
17841                 int ret;
17842  
17843 +               /* XXX */
17844 +
17845                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
17846                 preempt_count_inc();
17847                 ret = fn(arg);
17848 @@ -541,7 +543,7 @@ static int __init cpu_stop_init(void)
17849         for_each_possible_cpu(cpu) {
17850                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
17851  
17852 -               spin_lock_init(&stopper->lock);
17853 +               raw_spin_lock_init(&stopper->lock);
17854                 INIT_LIST_HEAD(&stopper->works);
17855         }
17856  
17857 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
17858 index bb5ec425dfe0..8338b14ed3a3 100644
17859 --- a/kernel/time/hrtimer.c
17860 +++ b/kernel/time/hrtimer.c
17861 @@ -53,6 +53,7 @@
17862  #include <asm/uaccess.h>
17863  
17864  #include <trace/events/timer.h>
17865 +#include <trace/events/hist.h>
17866  
17867  #include "tick-internal.h"
17868  
17869 @@ -695,6 +696,29 @@ static void hrtimer_switch_to_hres(void)
17870         retrigger_next_event(NULL);
17871  }
17872  
17873 +#ifdef CONFIG_PREEMPT_RT_FULL
17874 +
17875 +static struct swork_event clock_set_delay_work;
17876 +
17877 +static void run_clock_set_delay(struct swork_event *event)
17878 +{
17879 +       clock_was_set();
17880 +}
17881 +
17882 +void clock_was_set_delayed(void)
17883 +{
17884 +       swork_queue(&clock_set_delay_work);
17885 +}
17886 +
17887 +static __init int create_clock_set_delay_thread(void)
17888 +{
17889 +       WARN_ON(swork_get());
17890 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
17891 +       return 0;
17892 +}
17893 +early_initcall(create_clock_set_delay_thread);
17894 +#else /* PREEMPT_RT_FULL */
17895 +
17896  static void clock_was_set_work(struct work_struct *work)
17897  {
17898         clock_was_set();
17899 @@ -710,6 +734,7 @@ void clock_was_set_delayed(void)
17900  {
17901         schedule_work(&hrtimer_work);
17902  }
17903 +#endif
17904  
17905  #else
17906  
17907 @@ -719,11 +744,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
17908  static inline void hrtimer_switch_to_hres(void) { }
17909  static inline void
17910  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
17911 -static inline int hrtimer_reprogram(struct hrtimer *timer,
17912 -                                   struct hrtimer_clock_base *base)
17913 -{
17914 -       return 0;
17915 -}
17916 +static inline void hrtimer_reprogram(struct hrtimer *timer,
17917 +                                    struct hrtimer_clock_base *base) { }
17918  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
17919  static inline void retrigger_next_event(void *arg) { }
17920  
17921 @@ -855,6 +877,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
17922  }
17923  EXPORT_SYMBOL_GPL(hrtimer_forward);
17924  
17925 +#ifdef CONFIG_PREEMPT_RT_BASE
17926 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
17927 +
17928 +/**
17929 + * hrtimer_wait_for_timer - Wait for a running timer
17930 + *
17931 + * @timer:     timer to wait for
17932 + *
17933 + * The function waits on the waitqueue of the timer base in case
17934 + * the timer's callback function is currently executing. The
17935 + * waitqueue is woken up after the timer callback function has
17936 + * finished execution.
17937 + */
17938 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
17939 +{
17940 +       struct hrtimer_clock_base *base = timer->base;
17941 +
17942 +       if (base && base->cpu_base && !timer->irqsafe)
17943 +               wait_event(base->cpu_base->wait,
17944 +                               !(hrtimer_callback_running(timer)));
17945 +}
17946 +
17947 +#else
17948 +# define wake_up_timer_waiters(b)      do { } while (0)
17949 +#endif
17950 +
17951  /*
17952   * enqueue_hrtimer - internal function to (re)start a timer
17953   *
17954 @@ -896,6 +944,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
17955         if (!(state & HRTIMER_STATE_ENQUEUED))
17956                 return;
17957  
17958 +       if (unlikely(!list_empty(&timer->cb_entry))) {
17959 +               list_del_init(&timer->cb_entry);
17960 +               return;
17961 +       }
17962 +
17963         if (!timerqueue_del(&base->active, &timer->node))
17964                 cpu_base->active_bases &= ~(1 << base->index);
17965  
17966 @@ -991,7 +1044,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
17967         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
17968  
17969         timer_stats_hrtimer_set_start_info(timer);
17970 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
17971 +       {
17972 +               ktime_t now = new_base->get_time();
17973  
17974 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
17975 +                       timer->praecox = now;
17976 +               else
17977 +                       timer->praecox = ktime_set(0, 0);
17978 +       }
17979 +#endif
17980         leftmost = enqueue_hrtimer(timer, new_base);
17981         if (!leftmost)
17982                 goto unlock;
17983 @@ -1063,7 +1125,7 @@ int hrtimer_cancel(struct hrtimer *timer)
17984  
17985                 if (ret >= 0)
17986                         return ret;
17987 -               cpu_relax();
17988 +               hrtimer_wait_for_timer(timer);
17989         }
17990  }
17991  EXPORT_SYMBOL_GPL(hrtimer_cancel);
17992 @@ -1127,6 +1189,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
17993  
17994         base = hrtimer_clockid_to_base(clock_id);
17995         timer->base = &cpu_base->clock_base[base];
17996 +       INIT_LIST_HEAD(&timer->cb_entry);
17997         timerqueue_init(&timer->node);
17998  
17999  #ifdef CONFIG_TIMER_STATS
18000 @@ -1167,6 +1230,7 @@ bool hrtimer_active(const struct hrtimer *timer)
18001                 seq = raw_read_seqcount_begin(&cpu_base->seq);
18002  
18003                 if (timer->state != HRTIMER_STATE_INACTIVE ||
18004 +                   cpu_base->running_soft == timer ||
18005                     cpu_base->running == timer)
18006                         return true;
18007  
18008 @@ -1265,10 +1329,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
18009         cpu_base->running = NULL;
18010  }
18011  
18012 +#ifdef CONFIG_PREEMPT_RT_BASE
18013 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
18014 +                                struct hrtimer_clock_base *base)
18015 +{
18016 +       int leftmost;
18017 +
18018 +       if (restart != HRTIMER_NORESTART &&
18019 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
18020 +
18021 +               leftmost = enqueue_hrtimer(timer, base);
18022 +               if (!leftmost)
18023 +                       return;
18024 +#ifdef CONFIG_HIGH_RES_TIMERS
18025 +               if (!hrtimer_is_hres_active(timer)) {
18026 +                       /*
18027 +                        * Kick to reschedule the next tick to handle the new timer
18028 +                        * on dynticks target.
18029 +                        */
18030 +                       if (base->cpu_base->nohz_active)
18031 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
18032 +               } else {
18033 +
18034 +                       hrtimer_reprogram(timer, base);
18035 +               }
18036 +#endif
18037 +       }
18038 +}
18039 +
18040 +/*
18041 + * The changes in mainline which removed the callback modes from
18042 + * hrtimer are not yet working with -rt. The non wakeup_process()
18043 + * based callbacks which involve sleeping locks need to be treated
18044 + * separately.
18045 + */
18046 +static void hrtimer_rt_run_pending(void)
18047 +{
18048 +       enum hrtimer_restart (*fn)(struct hrtimer *);
18049 +       struct hrtimer_cpu_base *cpu_base;
18050 +       struct hrtimer_clock_base *base;
18051 +       struct hrtimer *timer;
18052 +       int index, restart;
18053 +
18054 +       local_irq_disable();
18055 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
18056 +
18057 +       raw_spin_lock(&cpu_base->lock);
18058 +
18059 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
18060 +               base = &cpu_base->clock_base[index];
18061 +
18062 +               while (!list_empty(&base->expired)) {
18063 +                       timer = list_first_entry(&base->expired,
18064 +                                                struct hrtimer, cb_entry);
18065 +
18066 +                       /*
18067 +                        * Same as the above __run_hrtimer function,
18068 +                        * except that we run with interrupts enabled.
18069 +                        */
18070 +                       debug_deactivate(timer);
18071 +                       cpu_base->running_soft = timer;
18072 +                       raw_write_seqcount_barrier(&cpu_base->seq);
18073 +
18074 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
18075 +                       timer_stats_account_hrtimer(timer);
18076 +                       fn = timer->function;
18077 +
18078 +                       raw_spin_unlock_irq(&cpu_base->lock);
18079 +                       restart = fn(timer);
18080 +                       raw_spin_lock_irq(&cpu_base->lock);
18081 +
18082 +                       hrtimer_rt_reprogram(restart, timer, base);
18083 +                       raw_write_seqcount_barrier(&cpu_base->seq);
18084 +
18085 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
18086 +                       cpu_base->running_soft = NULL;
18087 +               }
18088 +       }
18089 +
18090 +       raw_spin_unlock_irq(&cpu_base->lock);
18091 +
18092 +       wake_up_timer_waiters(cpu_base);
18093 +}
18094 +
18095 +static int hrtimer_rt_defer(struct hrtimer *timer)
18096 +{
18097 +       if (timer->irqsafe)
18098 +               return 0;
18099 +
18100 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
18101 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
18102 +       return 1;
18103 +}
18104 +
18105 +#else
18106 +
18107 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
18108 +
18109 +#endif
18110 +
18111 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
18112 +
18113  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
18114  {
18115         struct hrtimer_clock_base *base = cpu_base->clock_base;
18116         unsigned int active = cpu_base->active_bases;
18117 +       int raise = 0;
18118  
18119         for (; active; base++, active >>= 1) {
18120                 struct timerqueue_node *node;
18121 @@ -1284,6 +1450,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
18122  
18123                         timer = container_of(node, struct hrtimer, node);
18124  
18125 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
18126 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
18127 +                               timer->praecox : hrtimer_get_expires(timer),
18128 +                               basenow)),
18129 +                           current,
18130 +                           timer->function == hrtimer_wakeup ?
18131 +                           container_of(timer, struct hrtimer_sleeper,
18132 +                               timer)->task : NULL);
18133 +
18134                         /*
18135                          * The immediate goal for using the softexpires is
18136                          * minimizing wakeups, not running timers at the
18137 @@ -1299,9 +1474,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
18138                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
18139                                 break;
18140  
18141 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
18142 +                       if (!hrtimer_rt_defer(timer))
18143 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
18144 +                       else
18145 +                               raise = 1;
18146                 }
18147         }
18148 +       if (raise)
18149 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
18150  }
18151  
18152  #ifdef CONFIG_HIGH_RES_TIMERS
18153 @@ -1464,16 +1644,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
18154  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
18155  {
18156         sl->timer.function = hrtimer_wakeup;
18157 +       sl->timer.irqsafe = 1;
18158         sl->task = task;
18159  }
18160  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
18161  
18162 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
18163 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
18164 +                               unsigned long state)
18165  {
18166         hrtimer_init_sleeper(t, current);
18167  
18168         do {
18169 -               set_current_state(TASK_INTERRUPTIBLE);
18170 +               set_current_state(state);
18171                 hrtimer_start_expires(&t->timer, mode);
18172  
18173                 if (likely(t->task))
18174 @@ -1515,7 +1697,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
18175                                 HRTIMER_MODE_ABS);
18176         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
18177  
18178 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
18179 +       /* cpu_chill() does not care about restart state. */
18180 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
18181                 goto out;
18182  
18183         rmtp = restart->nanosleep.rmtp;
18184 @@ -1532,8 +1715,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
18185         return ret;
18186  }
18187  
18188 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
18189 -                      const enum hrtimer_mode mode, const clockid_t clockid)
18190 +static long
18191 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
18192 +                   const enum hrtimer_mode mode, const clockid_t clockid,
18193 +                   unsigned long state)
18194  {
18195         struct restart_block *restart;
18196         struct hrtimer_sleeper t;
18197 @@ -1546,7 +1731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
18198  
18199         hrtimer_init_on_stack(&t.timer, clockid, mode);
18200         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
18201 -       if (do_nanosleep(&t, mode))
18202 +       if (do_nanosleep(&t, mode, state))
18203                 goto out;
18204  
18205         /* Absolute timers do not update the rmtp value and restart: */
18206 @@ -1573,6 +1758,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
18207         return ret;
18208  }
18209  
18210 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
18211 +                      const enum hrtimer_mode mode, const clockid_t clockid)
18212 +{
18213 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
18214 +}
18215 +
18216  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
18217                 struct timespec __user *, rmtp)
18218  {
18219 @@ -1587,6 +1778,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
18220         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
18221  }
18222  
18223 +#ifdef CONFIG_PREEMPT_RT_FULL
18224 +/*
18225 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
18226 + */
18227 +void cpu_chill(void)
18228 +{
18229 +       struct timespec tu = {
18230 +               .tv_nsec = NSEC_PER_MSEC,
18231 +       };
18232 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
18233 +
18234 +       current->flags |= PF_NOFREEZE;
18235 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
18236 +                           TASK_UNINTERRUPTIBLE);
18237 +       if (!freeze_flag)
18238 +               current->flags &= ~PF_NOFREEZE;
18239 +}
18240 +EXPORT_SYMBOL(cpu_chill);
18241 +#endif
18242 +
18243  /*
18244   * Functions related to boot-time initialization:
18245   */
18246 @@ -1598,10 +1809,14 @@ int hrtimers_prepare_cpu(unsigned int cpu)
18247         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
18248                 cpu_base->clock_base[i].cpu_base = cpu_base;
18249                 timerqueue_init_head(&cpu_base->clock_base[i].active);
18250 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
18251         }
18252  
18253         cpu_base->cpu = cpu;
18254         hrtimer_init_hres(cpu_base);
18255 +#ifdef CONFIG_PREEMPT_RT_BASE
18256 +       init_waitqueue_head(&cpu_base->wait);
18257 +#endif
18258         return 0;
18259  }
18260  
18261 @@ -1671,9 +1886,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
18262  
18263  #endif /* CONFIG_HOTPLUG_CPU */
18264  
18265 +#ifdef CONFIG_PREEMPT_RT_BASE
18266 +
18267 +static void run_hrtimer_softirq(struct softirq_action *h)
18268 +{
18269 +       hrtimer_rt_run_pending();
18270 +}
18271 +
18272 +static void hrtimers_open_softirq(void)
18273 +{
18274 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
18275 +}
18276 +
18277 +#else
18278 +static void hrtimers_open_softirq(void) { }
18279 +#endif
18280 +
18281  void __init hrtimers_init(void)
18282  {
18283         hrtimers_prepare_cpu(smp_processor_id());
18284 +       hrtimers_open_softirq();
18285  }
18286  
18287  /**
18288 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
18289 index 1d5c7204ddc9..184de6751180 100644
18290 --- a/kernel/time/itimer.c
18291 +++ b/kernel/time/itimer.c
18292 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
18293                 /* We are sharing ->siglock with it_real_fn() */
18294                 if (hrtimer_try_to_cancel(timer) < 0) {
18295                         spin_unlock_irq(&tsk->sighand->siglock);
18296 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
18297                         goto again;
18298                 }
18299                 expires = timeval_to_ktime(value->it_value);
18300 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
18301 index 555e21f7b966..a5d6435fabbb 100644
18302 --- a/kernel/time/jiffies.c
18303 +++ b/kernel/time/jiffies.c
18304 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
18305         .max_cycles     = 10,
18306  };
18307  
18308 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
18309 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
18310 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
18311  
18312  #if (BITS_PER_LONG < 64)
18313  u64 get_jiffies_64(void)
18314 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
18315         u64 ret;
18316  
18317         do {
18318 -               seq = read_seqbegin(&jiffies_lock);
18319 +               seq = read_seqcount_begin(&jiffies_seq);
18320                 ret = jiffies_64;
18321 -       } while (read_seqretry(&jiffies_lock, seq));
18322 +       } while (read_seqcount_retry(&jiffies_seq, seq));
18323         return ret;
18324  }
18325  EXPORT_SYMBOL(get_jiffies_64);
18326 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
18327 index 6df8927c58a5..05b7391bf9bd 100644
18328 --- a/kernel/time/ntp.c
18329 +++ b/kernel/time/ntp.c
18330 @@ -17,6 +17,7 @@
18331  #include <linux/module.h>
18332  #include <linux/rtc.h>
18333  #include <linux/math64.h>
18334 +#include <linux/swork.h>
18335  
18336  #include "ntp_internal.h"
18337  #include "timekeeping_internal.h"
18338 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
18339                            &sync_cmos_work, timespec64_to_jiffies(&next));
18340  }
18341  
18342 +#ifdef CONFIG_PREEMPT_RT_FULL
18343 +
18344 +static void run_clock_set_delay(struct swork_event *event)
18345 +{
18346 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
18347 +}
18348 +
18349 +static struct swork_event ntp_cmos_swork;
18350 +
18351 +void ntp_notify_cmos_timer(void)
18352 +{
18353 +       swork_queue(&ntp_cmos_swork);
18354 +}
18355 +
18356 +static __init int create_cmos_delay_thread(void)
18357 +{
18358 +       WARN_ON(swork_get());
18359 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
18360 +       return 0;
18361 +}
18362 +early_initcall(create_cmos_delay_thread);
18363 +
18364 +#else
18365 +
18366  void ntp_notify_cmos_timer(void)
18367  {
18368         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
18369  }
18370 +#endif /* CONFIG_PREEMPT_RT_FULL */
18371  
18372  #else
18373  void ntp_notify_cmos_timer(void) { }
18374 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
18375 index 39008d78927a..633f4eaca9e7 100644
18376 --- a/kernel/time/posix-cpu-timers.c
18377 +++ b/kernel/time/posix-cpu-timers.c
18378 @@ -3,6 +3,7 @@
18379   */
18380  
18381  #include <linux/sched.h>
18382 +#include <linux/sched/rt.h>
18383  #include <linux/posix-timers.h>
18384  #include <linux/errno.h>
18385  #include <linux/math64.h>
18386 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
18387         /*
18388          * Disarm any old timer after extracting its expiry time.
18389          */
18390 -       WARN_ON_ONCE(!irqs_disabled());
18391 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
18392  
18393         ret = 0;
18394         old_incr = timer->it.cpu.incr;
18395 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
18396         /*
18397          * Now re-arm for the new expiry time.
18398          */
18399 -       WARN_ON_ONCE(!irqs_disabled());
18400 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
18401         arm_timer(timer);
18402         unlock_task_sighand(p, &flags);
18403  
18404 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
18405   * already updated our counts.  We need to check if any timers fire now.
18406   * Interrupts are disabled.
18407   */
18408 -void run_posix_cpu_timers(struct task_struct *tsk)
18409 +static void __run_posix_cpu_timers(struct task_struct *tsk)
18410  {
18411         LIST_HEAD(firing);
18412         struct k_itimer *timer, *next;
18413         unsigned long flags;
18414  
18415 -       WARN_ON_ONCE(!irqs_disabled());
18416 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
18417  
18418         /*
18419          * The fast path checks that there are no expired thread or thread
18420 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
18421         }
18422  }
18423  
18424 +#ifdef CONFIG_PREEMPT_RT_BASE
18425 +#include <linux/kthread.h>
18426 +#include <linux/cpu.h>
18427 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
18428 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
18429 +
18430 +static int posix_cpu_timers_thread(void *data)
18431 +{
18432 +       int cpu = (long)data;
18433 +
18434 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
18435 +
18436 +       while (!kthread_should_stop()) {
18437 +               struct task_struct *tsk = NULL;
18438 +               struct task_struct *next = NULL;
18439 +
18440 +               if (cpu_is_offline(cpu))
18441 +                       goto wait_to_die;
18442 +
18443 +               /* grab task list */
18444 +               raw_local_irq_disable();
18445 +               tsk = per_cpu(posix_timer_tasklist, cpu);
18446 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
18447 +               raw_local_irq_enable();
18448 +
18449 +               /* it's possible the list is empty, just return */
18450 +               if (!tsk) {
18451 +                       set_current_state(TASK_INTERRUPTIBLE);
18452 +                       schedule();
18453 +                       __set_current_state(TASK_RUNNING);
18454 +                       continue;
18455 +               }
18456 +
18457 +               /* Process task list */
18458 +               while (1) {
18459 +                       /* save next */
18460 +                       next = tsk->posix_timer_list;
18461 +
18462 +                       /* run the task timers, clear its ptr and
18463 +                        * unreference it
18464 +                        */
18465 +                       __run_posix_cpu_timers(tsk);
18466 +                       tsk->posix_timer_list = NULL;
18467 +                       put_task_struct(tsk);
18468 +
18469 +                       /* check if this is the last on the list */
18470 +                       if (next == tsk)
18471 +                               break;
18472 +                       tsk = next;
18473 +               }
18474 +       }
18475 +       return 0;
18476 +
18477 +wait_to_die:
18478 +       /* Wait for kthread_stop */
18479 +       set_current_state(TASK_INTERRUPTIBLE);
18480 +       while (!kthread_should_stop()) {
18481 +               schedule();
18482 +               set_current_state(TASK_INTERRUPTIBLE);
18483 +       }
18484 +       __set_current_state(TASK_RUNNING);
18485 +       return 0;
18486 +}
18487 +
18488 +static inline int __fastpath_timer_check(struct task_struct *tsk)
18489 +{
18490 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
18491 +       if (unlikely(tsk->exit_state))
18492 +               return 0;
18493 +
18494 +       if (!task_cputime_zero(&tsk->cputime_expires))
18495 +               return 1;
18496 +
18497 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
18498 +               return 1;
18499 +
18500 +       return 0;
18501 +}
18502 +
18503 +void run_posix_cpu_timers(struct task_struct *tsk)
18504 +{
18505 +       unsigned long cpu = smp_processor_id();
18506 +       struct task_struct *tasklist;
18507 +
18508 +       BUG_ON(!irqs_disabled());
18509 +       if (!per_cpu(posix_timer_task, cpu))
18510 +               return;
18511 +       /* get per-cpu references */
18512 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
18513 +
18514 +       /* check to see if we're already queued */
18515 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
18516 +               get_task_struct(tsk);
18517 +               if (tasklist) {
18518 +                       tsk->posix_timer_list = tasklist;
18519 +               } else {
18520 +                       /*
18521 +                        * The list is terminated by a self-pointing
18522 +                        * task_struct
18523 +                        */
18524 +                       tsk->posix_timer_list = tsk;
18525 +               }
18526 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
18527 +
18528 +               wake_up_process(per_cpu(posix_timer_task, cpu));
18529 +       }
18530 +}
18531 +
18532 +/*
18533 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
18534 + * Here we can start up the necessary posix timer thread for the new CPU.
18535 + */
18536 +static int posix_cpu_thread_call(struct notifier_block *nfb,
18537 +                                unsigned long action, void *hcpu)
18538 +{
18539 +       int cpu = (long)hcpu;
18540 +       struct task_struct *p;
18541 +       struct sched_param param;
18542 +
18543 +       switch (action) {
18544 +       case CPU_UP_PREPARE:
18545 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
18546 +                                       "posixcputmr/%d",cpu);
18547 +               if (IS_ERR(p))
18548 +                       return NOTIFY_BAD;
18549 +               p->flags |= PF_NOFREEZE;
18550 +               kthread_bind(p, cpu);
18551 +               /* Must be high prio to avoid getting starved */
18552 +               param.sched_priority = MAX_RT_PRIO-1;
18553 +               sched_setscheduler(p, SCHED_FIFO, &param);
18554 +               per_cpu(posix_timer_task,cpu) = p;
18555 +               break;
18556 +       case CPU_ONLINE:
18557 +               /* Strictly unnecessary, as first user will wake it. */
18558 +               wake_up_process(per_cpu(posix_timer_task,cpu));
18559 +               break;
18560 +#ifdef CONFIG_HOTPLUG_CPU
18561 +       case CPU_UP_CANCELED:
18562 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
18563 +               kthread_bind(per_cpu(posix_timer_task, cpu),
18564 +                            cpumask_any(cpu_online_mask));
18565 +               kthread_stop(per_cpu(posix_timer_task,cpu));
18566 +               per_cpu(posix_timer_task,cpu) = NULL;
18567 +               break;
18568 +       case CPU_DEAD:
18569 +               kthread_stop(per_cpu(posix_timer_task,cpu));
18570 +               per_cpu(posix_timer_task,cpu) = NULL;
18571 +               break;
18572 +#endif
18573 +       }
18574 +       return NOTIFY_OK;
18575 +}
18576 +
18577 +/* Register at highest priority so that task migration (migrate_all_tasks)
18578 + * happens before everything else.
18579 + */
18580 +static struct notifier_block posix_cpu_thread_notifier = {
18581 +       .notifier_call = posix_cpu_thread_call,
18582 +       .priority = 10
18583 +};
18584 +
18585 +static int __init posix_cpu_thread_init(void)
18586 +{
18587 +       void *hcpu = (void *)(long)smp_processor_id();
18588 +       /* Start one for boot CPU. */
18589 +       unsigned long cpu;
18590 +
18591 +       /* init the per-cpu posix_timer_tasklets */
18592 +       for_each_possible_cpu(cpu)
18593 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
18594 +
18595 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
18596 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
18597 +       register_cpu_notifier(&posix_cpu_thread_notifier);
18598 +       return 0;
18599 +}
18600 +early_initcall(posix_cpu_thread_init);
18601 +#else /* CONFIG_PREEMPT_RT_BASE */
18602 +void run_posix_cpu_timers(struct task_struct *tsk)
18603 +{
18604 +       __run_posix_cpu_timers(tsk);
18605 +}
18606 +#endif /* CONFIG_PREEMPT_RT_BASE */
18607 +
18608  /*
18609   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
18610   * The tsk->sighand->siglock must be held by the caller.
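The CONFIG_PREEMPT_RT_BASE block above moves expiry processing to the per-CPU posixcputmr/N kthread: run_posix_cpu_timers() only links the current task into a per-CPU list and wakes the thread, and the list is terminated by an entry whose pointer refers to itself instead of to NULL, so a NULL pointer can keep meaning "not queued". A standalone C sketch of that list discipline follows; struct task and both helpers are illustrative, not kernel code.

/* Sketch of the self-terminating list used above: the last element points
 * to itself instead of to NULL, so "next == node" marks the end while a
 * NULL pointer still means "not queued".
 */
#include <stdio.h>

struct task {
        const char *name;
        struct task *posix_timer_list;  /* NULL = not queued */
};

static struct task *tasklist;           /* per-CPU head in the real code */

static void enqueue(struct task *t)
{
        if (t->posix_timer_list)
                return;                 /* already queued */
        /* the first element terminates the list by pointing to itself */
        t->posix_timer_list = tasklist ? tasklist : t;
        tasklist = t;
}

static void drain(void)                 /* what posix_cpu_timers_thread() does */
{
        struct task *t = tasklist;

        tasklist = NULL;
        while (t) {
                struct task *next = t->posix_timer_list;

                printf("running CPU timers for %s\n", t->name);
                t->posix_timer_list = NULL;
                if (next == t)          /* self-pointer: last element */
                        break;
                t = next;
        }
}

int main(void)
{
        struct task a = { "a", NULL }, b = { "b", NULL };

        enqueue(&a);
        enqueue(&b);
        drain();                        /* prints b, then a (LIFO order) */
        return 0;
}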
18611 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
18612 index f2826c35e918..464a98155a0e 100644
18613 --- a/kernel/time/posix-timers.c
18614 +++ b/kernel/time/posix-timers.c
18615 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
18616  static struct pid *good_sigevent(sigevent_t * event)
18617  {
18618         struct task_struct *rtn = current->group_leader;
18619 +       int sig = event->sigev_signo;
18620  
18621         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
18622                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
18623 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
18624                 return NULL;
18625  
18626         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
18627 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
18628 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
18629 +            sig_kernel_coredump(sig)))
18630                 return NULL;
18631  
18632         return task_pid(rtn);
18633 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
18634         return overrun;
18635  }
18636  
18637 +/*
18638 + * Protected by RCU!
18639 + */
18640 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
18641 +{
18642 +#ifdef CONFIG_PREEMPT_RT_FULL
18643 +       if (kc->timer_set == common_timer_set)
18644 +               hrtimer_wait_for_timer(&timr->it.real.timer);
18645 +       else
18646 +               /* FIXME: Whacky hack for posix-cpu-timers */
18647 +               schedule_timeout(1);
18648 +#endif
18649 +}
18650 +
18651  /* Set a POSIX.1b interval timer. */
18652  /* timr->it_lock is taken. */
18653  static int
18654 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
18655         if (!timr)
18656                 return -EINVAL;
18657  
18658 +       rcu_read_lock();
18659         kc = clockid_to_kclock(timr->it_clock);
18660         if (WARN_ON_ONCE(!kc || !kc->timer_set))
18661                 error = -EINVAL;
18662 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
18663  
18664         unlock_timer(timr, flag);
18665         if (error == TIMER_RETRY) {
18666 +               timer_wait_for_callback(kc, timr);
18667                 rtn = NULL;     // We already got the old time...
18668 +               rcu_read_unlock();
18669                 goto retry;
18670         }
18671 +       rcu_read_unlock();
18672  
18673         if (old_setting && !error &&
18674             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
18675 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
18676         if (!timer)
18677                 return -EINVAL;
18678  
18679 +       rcu_read_lock();
18680         if (timer_delete_hook(timer) == TIMER_RETRY) {
18681                 unlock_timer(timer, flags);
18682 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
18683 +                                       timer);
18684 +               rcu_read_unlock();
18685                 goto retry_delete;
18686         }
18687 +       rcu_read_unlock();
18688  
18689         spin_lock(&current->sighand->siglock);
18690         list_del(&timer->list);
18691 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
18692  retry_delete:
18693         spin_lock_irqsave(&timer->it_lock, flags);
18694  
18695 -       if (timer_delete_hook(timer) == TIMER_RETRY) {
18696 +       /* On RT we can race with a deletion */
18697 +       if (!timer->it_signal) {
18698                 unlock_timer(timer, flags);
18699 +               return;
18700 +       }
18701 +
18702 +       if (timer_delete_hook(timer) == TIMER_RETRY) {
18703 +               rcu_read_lock();
18704 +               unlock_timer(timer, flags);
18705 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
18706 +                                       timer);
18707 +               rcu_read_unlock();
18708                 goto retry_delete;
18709         }
18710         list_del(&timer->list);
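When a set or delete operation above hits TIMER_RETRY, the timer lock is dropped, timer_wait_for_callback() waits for the running handler to finish, and the operation is retried; rcu_read_lock() keeps the k_itimer alive across the unlocked window. A reduced pthread sketch of that retry-after-wait shape follows (names are illustrative, and the RCU protection is omitted).

/* Sketch: if the handler is still running, drop the lock, sleep until it
 * finishes, then retry the delete instead of spinning.
 * Build: cc -pthread retry.c -o retry
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define TIMER_RETRY 1

struct ktimer {
        pthread_mutex_t lock;
        pthread_cond_t  cb_done;
        bool            callback_running;
        bool            armed;
};

static int timer_delete_hook(struct ktimer *t)  /* called with t->lock held */
{
        if (t->callback_running)
                return TIMER_RETRY;     /* can't remove a running handler */
        t->armed = false;
        return 0;
}

static void timer_wait_for_callback(struct ktimer *t)  /* lock not held */
{
        pthread_mutex_lock(&t->lock);
        while (t->callback_running)
                pthread_cond_wait(&t->cb_done, &t->lock);
        pthread_mutex_unlock(&t->lock);
}

static void timer_delete_sync(struct ktimer *t)
{
retry_delete:
        pthread_mutex_lock(&t->lock);
        if (timer_delete_hook(t) == TIMER_RETRY) {
                pthread_mutex_unlock(&t->lock);
                timer_wait_for_callback(t);     /* sleep, don't busy-loop */
                goto retry_delete;
        }
        pthread_mutex_unlock(&t->lock);
        printf("timer deleted\n");
}

static void *callback_thread(void *arg)         /* pretends to be the handler */
{
        struct ktimer *t = arg;

        usleep(100000);                 /* handler keeps running for 100 ms */
        pthread_mutex_lock(&t->lock);
        t->callback_running = false;
        pthread_cond_broadcast(&t->cb_done);
        pthread_mutex_unlock(&t->lock);
        return NULL;
}

int main(void)
{
        static struct ktimer t = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cb_done = PTHREAD_COND_INITIALIZER,
                .callback_running = true,
                .armed = true,
        };
        pthread_t cb;

        pthread_create(&cb, NULL, callback_thread, &t);
        timer_delete_sync(&t);          /* hits TIMER_RETRY once, then waits */
        pthread_join(cb, NULL);
        return 0;
}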
18711 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
18712 index 690b797f522e..fe8ba1619879 100644
18713 --- a/kernel/time/tick-broadcast-hrtimer.c
18714 +++ b/kernel/time/tick-broadcast-hrtimer.c
18715 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
18716  {
18717         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
18718         bctimer.function = bc_handler;
18719 +       bctimer.irqsafe = true;
18720         clockevents_register_device(&ce_broadcast_hrtimer);
18721  }
18722 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
18723 index 4fcd99e12aa0..5a47f2e98faf 100644
18724 --- a/kernel/time/tick-common.c
18725 +++ b/kernel/time/tick-common.c
18726 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
18727  static void tick_periodic(int cpu)
18728  {
18729         if (tick_do_timer_cpu == cpu) {
18730 -               write_seqlock(&jiffies_lock);
18731 +               raw_spin_lock(&jiffies_lock);
18732 +               write_seqcount_begin(&jiffies_seq);
18733  
18734                 /* Keep track of the next tick event */
18735                 tick_next_period = ktime_add(tick_next_period, tick_period);
18736  
18737                 do_timer(1);
18738 -               write_sequnlock(&jiffies_lock);
18739 +               write_seqcount_end(&jiffies_seq);
18740 +               raw_spin_unlock(&jiffies_lock);
18741                 update_wall_time();
18742         }
18743  
18744 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
18745                 ktime_t next;
18746  
18747                 do {
18748 -                       seq = read_seqbegin(&jiffies_lock);
18749 +                       seq = read_seqcount_begin(&jiffies_seq);
18750                         next = tick_next_period;
18751 -               } while (read_seqretry(&jiffies_lock, seq));
18752 +               } while (read_seqcount_retry(&jiffies_seq, seq));
18753  
18754                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
18755  
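Here, and in the tick-sched.c, timekeeping.c and timekeeping.h hunks below, the jiffies seqlock is split into a raw spinlock that serializes writers and a plain seqcount that readers poll, so the write side never takes a sleeping lock on RT while readers stay lockless. A simplified userspace sketch of that protocol follows, with a pthread mutex standing in for the raw spinlock and C11 atomics for the sequence counter; the real seqcount code uses finer-grained memory barriers.

/* Sketch of the jiffies read/write protocol above: the writer takes a lock
 * and bumps the sequence to odd before and even after the update; readers
 * retry if they saw an odd sequence or if it changed underneath them.
 * Build: cc -pthread seq.c -o seq
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t jiffies_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint jiffies_seq;         /* even = no write in progress */
static uint64_t jiffies_64;
static uint64_t tick_next_period;

static void do_tick(void)               /* writer, cf. tick_periodic() */
{
        pthread_mutex_lock(&jiffies_lock);      /* raw_spin_lock() */
        atomic_fetch_add(&jiffies_seq, 1);      /* write_seqcount_begin() */
        jiffies_64++;
        tick_next_period += 1000000;
        atomic_fetch_add(&jiffies_seq, 1);      /* write_seqcount_end() */
        pthread_mutex_unlock(&jiffies_lock);    /* raw_spin_unlock() */
}

static uint64_t get_jiffies_64(void)    /* lockless reader */
{
        unsigned int seq;
        uint64_t ret;

        do {
                seq = atomic_load(&jiffies_seq);        /* read_seqcount_begin() */
                ret = jiffies_64;
        } while ((seq & 1) || seq != atomic_load(&jiffies_seq));
        return ret;
}

int main(void)
{
        do_tick();
        printf("jiffies = %llu\n", (unsigned long long)get_jiffies_64());
        return 0;
}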
18756 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
18757 index 3bcb61b52f6c..66d85482a96e 100644
18758 --- a/kernel/time/tick-sched.c
18759 +++ b/kernel/time/tick-sched.c
18760 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
18761                 return;
18762  
18763         /* Reevaluate with jiffies_lock held */
18764 -       write_seqlock(&jiffies_lock);
18765 +       raw_spin_lock(&jiffies_lock);
18766 +       write_seqcount_begin(&jiffies_seq);
18767  
18768         delta = ktime_sub(now, last_jiffies_update);
18769         if (delta.tv64 >= tick_period.tv64) {
18770 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
18771                 /* Keep the tick_next_period variable up to date */
18772                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
18773         } else {
18774 -               write_sequnlock(&jiffies_lock);
18775 +               write_seqcount_end(&jiffies_seq);
18776 +               raw_spin_unlock(&jiffies_lock);
18777                 return;
18778         }
18779 -       write_sequnlock(&jiffies_lock);
18780 +       write_seqcount_end(&jiffies_seq);
18781 +       raw_spin_unlock(&jiffies_lock);
18782         update_wall_time();
18783  }
18784  
18785 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
18786  {
18787         ktime_t period;
18788  
18789 -       write_seqlock(&jiffies_lock);
18790 +       raw_spin_lock(&jiffies_lock);
18791 +       write_seqcount_begin(&jiffies_seq);
18792         /* Did we start the jiffies update yet ? */
18793         if (last_jiffies_update.tv64 == 0)
18794                 last_jiffies_update = tick_next_period;
18795         period = last_jiffies_update;
18796 -       write_sequnlock(&jiffies_lock);
18797 +       write_seqcount_end(&jiffies_seq);
18798 +       raw_spin_unlock(&jiffies_lock);
18799         return period;
18800  }
18801  
18802 @@ -215,6 +220,7 @@ static void nohz_full_kick_func(struct irq_work *work)
18803  
18804  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
18805         .func = nohz_full_kick_func,
18806 +       .flags = IRQ_WORK_HARD_IRQ,
18807  };
18808  
18809  /*
18810 @@ -673,10 +679,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
18811  
18812         /* Read jiffies and the time when jiffies were updated last */
18813         do {
18814 -               seq = read_seqbegin(&jiffies_lock);
18815 +               seq = read_seqcount_begin(&jiffies_seq);
18816                 basemono = last_jiffies_update.tv64;
18817                 basejiff = jiffies;
18818 -       } while (read_seqretry(&jiffies_lock, seq));
18819 +       } while (read_seqcount_retry(&jiffies_seq, seq));
18820         ts->last_jiffies = basejiff;
18821  
18822         if (rcu_needs_cpu(basemono, &next_rcu) ||
18823 @@ -877,14 +883,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
18824                 return false;
18825  
18826         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
18827 -               static int ratelimit;
18828 -
18829 -               if (ratelimit < 10 &&
18830 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18831 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
18832 -                               (unsigned int) local_softirq_pending());
18833 -                       ratelimit++;
18834 -               }
18835 +               softirq_check_pending_idle();
18836                 return false;
18837         }
18838  
18839 @@ -1193,6 +1192,7 @@ void tick_setup_sched_timer(void)
18840          * Emulate tick processing via per-CPU hrtimers:
18841          */
18842         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
18843 +       ts->sched_timer.irqsafe = 1;
18844         ts->sched_timer.function = tick_sched_timer;
18845  
18846         /* Get the next period (per-CPU) */
18847 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
18848 index 46e312e9be38..fa75cf5d9253 100644
18849 --- a/kernel/time/timekeeping.c
18850 +++ b/kernel/time/timekeeping.c
18851 @@ -2328,8 +2328,10 @@ EXPORT_SYMBOL(hardpps);
18852   */
18853  void xtime_update(unsigned long ticks)
18854  {
18855 -       write_seqlock(&jiffies_lock);
18856 +       raw_spin_lock(&jiffies_lock);
18857 +       write_seqcount_begin(&jiffies_seq);
18858         do_timer(ticks);
18859 -       write_sequnlock(&jiffies_lock);
18860 +       write_seqcount_end(&jiffies_seq);
18861 +       raw_spin_unlock(&jiffies_lock);
18862         update_wall_time();
18863  }
18864 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
18865 index 704f595ce83f..763a3e5121ff 100644
18866 --- a/kernel/time/timekeeping.h
18867 +++ b/kernel/time/timekeeping.h
18868 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
18869  extern void do_timer(unsigned long ticks);
18870  extern void update_wall_time(void);
18871  
18872 -extern seqlock_t jiffies_lock;
18873 +extern raw_spinlock_t jiffies_lock;
18874 +extern seqcount_t jiffies_seq;
18875  
18876  #define CS_NAME_LEN    32
18877  
18878 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
18879 index c611c47de884..08a5ab762495 100644
18880 --- a/kernel/time/timer.c
18881 +++ b/kernel/time/timer.c
18882 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
18883  #endif
18884  
18885  struct timer_base {
18886 -       spinlock_t              lock;
18887 +       raw_spinlock_t          lock;
18888         struct timer_list       *running_timer;
18889 +#ifdef CONFIG_PREEMPT_RT_FULL
18890 +       struct swait_queue_head wait_for_running_timer;
18891 +#endif
18892         unsigned long           clk;
18893         unsigned long           next_expiry;
18894         unsigned int            cpu;
18895 @@ -948,10 +951,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
18896  
18897                 if (!(tf & TIMER_MIGRATING)) {
18898                         base = get_timer_base(tf);
18899 -                       spin_lock_irqsave(&base->lock, *flags);
18900 +                       raw_spin_lock_irqsave(&base->lock, *flags);
18901                         if (timer->flags == tf)
18902                                 return base;
18903 -                       spin_unlock_irqrestore(&base->lock, *flags);
18904 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
18905                 }
18906                 cpu_relax();
18907         }
18908 @@ -1023,9 +1026,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
18909                         /* See the comment in lock_timer_base() */
18910                         timer->flags |= TIMER_MIGRATING;
18911  
18912 -                       spin_unlock(&base->lock);
18913 +                       raw_spin_unlock(&base->lock);
18914                         base = new_base;
18915 -                       spin_lock(&base->lock);
18916 +                       raw_spin_lock(&base->lock);
18917                         WRITE_ONCE(timer->flags,
18918                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
18919                 }
18920 @@ -1050,7 +1053,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
18921         }
18922  
18923  out_unlock:
18924 -       spin_unlock_irqrestore(&base->lock, flags);
18925 +       raw_spin_unlock_irqrestore(&base->lock, flags);
18926  
18927         return ret;
18928  }
18929 @@ -1144,19 +1147,46 @@ void add_timer_on(struct timer_list *timer, int cpu)
18930         if (base != new_base) {
18931                 timer->flags |= TIMER_MIGRATING;
18932  
18933 -               spin_unlock(&base->lock);
18934 +               raw_spin_unlock(&base->lock);
18935                 base = new_base;
18936 -               spin_lock(&base->lock);
18937 +               raw_spin_lock(&base->lock);
18938                 WRITE_ONCE(timer->flags,
18939                            (timer->flags & ~TIMER_BASEMASK) | cpu);
18940         }
18941  
18942         debug_activate(timer, timer->expires);
18943         internal_add_timer(base, timer);
18944 -       spin_unlock_irqrestore(&base->lock, flags);
18945 +       raw_spin_unlock_irqrestore(&base->lock, flags);
18946  }
18947  EXPORT_SYMBOL_GPL(add_timer_on);
18948  
18949 +#ifdef CONFIG_PREEMPT_RT_FULL
18950 +/*
18951 + * Wait for a running timer
18952 + */
18953 +static void wait_for_running_timer(struct timer_list *timer)
18954 +{
18955 +       struct timer_base *base;
18956 +       u32 tf = timer->flags;
18957 +
18958 +       if (tf & TIMER_MIGRATING)
18959 +               return;
18960 +
18961 +       base = get_timer_base(tf);
18962 +       swait_event(base->wait_for_running_timer,
18963 +                  base->running_timer != timer);
18964 +}
18965 +
18966 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
18967 +#else
18968 +static inline void wait_for_running_timer(struct timer_list *timer)
18969 +{
18970 +       cpu_relax();
18971 +}
18972 +
18973 +# define wakeup_timer_waiters(b)       do { } while (0)
18974 +#endif
18975 +
18976  /**
18977   * del_timer - deactive a timer.
18978   * @timer: the timer to be deactivated
18979 @@ -1180,7 +1210,7 @@ int del_timer(struct timer_list *timer)
18980         if (timer_pending(timer)) {
18981                 base = lock_timer_base(timer, &flags);
18982                 ret = detach_if_pending(timer, base, true);
18983 -               spin_unlock_irqrestore(&base->lock, flags);
18984 +               raw_spin_unlock_irqrestore(&base->lock, flags);
18985         }
18986  
18987         return ret;
18988 @@ -1208,13 +1238,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
18989                 timer_stats_timer_clear_start_info(timer);
18990                 ret = detach_if_pending(timer, base, true);
18991         }
18992 -       spin_unlock_irqrestore(&base->lock, flags);
18993 +       raw_spin_unlock_irqrestore(&base->lock, flags);
18994  
18995         return ret;
18996  }
18997  EXPORT_SYMBOL(try_to_del_timer_sync);
18998  
18999 -#ifdef CONFIG_SMP
19000 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
19001  /**
19002   * del_timer_sync - deactivate a timer and wait for the handler to finish.
19003   * @timer: the timer to be deactivated
19004 @@ -1274,7 +1304,7 @@ int del_timer_sync(struct timer_list *timer)
19005                 int ret = try_to_del_timer_sync(timer);
19006                 if (ret >= 0)
19007                         return ret;
19008 -               cpu_relax();
19009 +               wait_for_running_timer(timer);
19010         }
19011  }
19012  EXPORT_SYMBOL(del_timer_sync);
19013 @@ -1339,14 +1369,17 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
19014                 fn = timer->function;
19015                 data = timer->data;
19016  
19017 -               if (timer->flags & TIMER_IRQSAFE) {
19018 -                       spin_unlock(&base->lock);
19019 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
19020 +                   timer->flags & TIMER_IRQSAFE) {
19021 +                       raw_spin_unlock(&base->lock);
19022                         call_timer_fn(timer, fn, data);
19023 -                       spin_lock(&base->lock);
19024 +                       base->running_timer = NULL;
19025 +                       raw_spin_lock(&base->lock);
19026                 } else {
19027 -                       spin_unlock_irq(&base->lock);
19028 +                       raw_spin_unlock_irq(&base->lock);
19029                         call_timer_fn(timer, fn, data);
19030 -                       spin_lock_irq(&base->lock);
19031 +                       base->running_timer = NULL;
19032 +                       raw_spin_lock_irq(&base->lock);
19033                 }
19034         }
19035  }
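The expire_timers() change above relies on IS_ENABLED(CONFIG_PREEMPT_RT_FULL) evaluating to a compile-time constant, so the IRQ-safe branch disappears entirely from RT builds. Below is a reduced sketch of how a macro of that kind turns a config symbol defined to 1 into 0 or 1 at preprocessing time; it is a simplified stand-in, not the kernel's include/linux/kconfig.h.

/* Reduced IS_ENABLED() sketch: 1 when the config macro is defined to 1,
 * otherwise 0, so the compiler can drop the dead branch entirely.
 */
#include <stdio.h>

#define CONFIG_PREEMPT_RT_FULL 1        /* comment this out to get the 0 case */

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED(option) __is_defined(option)

int main(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
                printf("RT build: IRQ-safe timers also run with interrupts enabled\n");
        else
                printf("non-RT build: IRQ-safe timers run with interrupts disabled\n");
        return 0;
}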
19036 @@ -1515,7 +1548,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
19037         if (cpu_is_offline(smp_processor_id()))
19038                 return expires;
19039  
19040 -       spin_lock(&base->lock);
19041 +       raw_spin_lock(&base->lock);
19042         nextevt = __next_timer_interrupt(base);
19043         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
19044         base->next_expiry = nextevt;
19045 @@ -1543,7 +1576,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
19046                 if ((expires - basem) > TICK_NSEC)
19047                         base->is_idle = true;
19048         }
19049 -       spin_unlock(&base->lock);
19050 +       raw_spin_unlock(&base->lock);
19051  
19052         return cmp_next_hrtimer_event(basem, expires);
19053  }
19054 @@ -1608,13 +1641,13 @@ void update_process_times(int user_tick)
19055  
19056         /* Note: this timer irq context must be accounted for as well. */
19057         account_process_tick(p, user_tick);
19058 +       scheduler_tick();
19059         run_local_timers();
19060         rcu_check_callbacks(user_tick);
19061 -#ifdef CONFIG_IRQ_WORK
19062 +#if defined(CONFIG_IRQ_WORK)
19063         if (in_irq())
19064                 irq_work_tick();
19065  #endif
19066 -       scheduler_tick();
19067         run_posix_cpu_timers(p);
19068  }
19069  
19070 @@ -1630,7 +1663,7 @@ static inline void __run_timers(struct timer_base *base)
19071         if (!time_after_eq(jiffies, base->clk))
19072                 return;
19073  
19074 -       spin_lock_irq(&base->lock);
19075 +       raw_spin_lock_irq(&base->lock);
19076  
19077         while (time_after_eq(jiffies, base->clk)) {
19078  
19079 @@ -1640,8 +1673,8 @@ static inline void __run_timers(struct timer_base *base)
19080                 while (levels--)
19081                         expire_timers(base, heads + levels);
19082         }
19083 -       base->running_timer = NULL;
19084 -       spin_unlock_irq(&base->lock);
19085 +       raw_spin_unlock_irq(&base->lock);
19086 +       wakeup_timer_waiters(base);
19087  }
19088  
19089  /*
19090 @@ -1651,6 +1684,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
19091  {
19092         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
19093  
19094 +       irq_work_tick_soft();
19095 +
19096         __run_timers(base);
19097         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
19098                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
19099 @@ -1836,16 +1871,16 @@ int timers_dead_cpu(unsigned int cpu)
19100                  * The caller is globally serialized and nobody else
19101                  * takes two locks at once, deadlock is not possible.
19102                  */
19103 -               spin_lock_irq(&new_base->lock);
19104 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
19105 +               raw_spin_lock_irq(&new_base->lock);
19106 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
19107  
19108                 BUG_ON(old_base->running_timer);
19109  
19110                 for (i = 0; i < WHEEL_SIZE; i++)
19111                         migrate_timer_list(new_base, old_base->vectors + i);
19112  
19113 -               spin_unlock(&old_base->lock);
19114 -               spin_unlock_irq(&new_base->lock);
19115 +               raw_spin_unlock(&old_base->lock);
19116 +               raw_spin_unlock_irq(&new_base->lock);
19117                 put_cpu_ptr(&timer_bases);
19118         }
19119         return 0;
19120 @@ -1861,8 +1896,11 @@ static void __init init_timer_cpu(int cpu)
19121         for (i = 0; i < NR_BASES; i++) {
19122                 base = per_cpu_ptr(&timer_bases[i], cpu);
19123                 base->cpu = cpu;
19124 -               spin_lock_init(&base->lock);
19125 +               raw_spin_lock_init(&base->lock);
19126                 base->clk = jiffies;
19127 +#ifdef CONFIG_PREEMPT_RT_FULL
19128 +               init_swait_queue_head(&base->wait_for_running_timer);
19129 +#endif
19130         }
19131  }
19132  
19133 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
19134 index 2a96b063d659..812e37237eb8 100644
19135 --- a/kernel/trace/Kconfig
19136 +++ b/kernel/trace/Kconfig
19137 @@ -182,6 +182,24 @@ config IRQSOFF_TRACER
19138           enabled. This option and the preempt-off timing option can be
19139           used together or separately.)
19140  
19141 +config INTERRUPT_OFF_HIST
19142 +       bool "Interrupts-off Latency Histogram"
19143 +       depends on IRQSOFF_TRACER
19144 +       help
19145 +         This option generates continuously updated histograms (one per cpu)
19146 +         of the duration of time periods with interrupts disabled. The
19147 +         histograms are disabled by default. To enable them, write a non-zero
19148 +         number to
19149 +
19150 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
19151 +
19152 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
19153 +         per cpu) are generated that accumulate the duration of time periods
19154 +         when both interrupts and preemption are disabled. The histogram data
19155 +         will be located in the debug file system at
19156 +
19157 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
19158 +
19159  config PREEMPT_TRACER
19160         bool "Preemption-off Latency Tracer"
19161         default n
19162 @@ -206,6 +224,24 @@ config PREEMPT_TRACER
19163           enabled. This option and the irqs-off timing option can be
19164           used together or separately.)
19165  
19166 +config PREEMPT_OFF_HIST
19167 +       bool "Preemption-off Latency Histogram"
19168 +       depends on PREEMPT_TRACER
19169 +       help
19170 +         This option generates continuously updated histograms (one per cpu)
19171 +         of the duration of time periods with preemption disabled. The
19172 +         histograms are disabled by default. To enable them, write a non-zero
19173 +         number to
19174 +
19175 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
19176 +
19177 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
19178 +         per cpu) are generated that accumulate the duration of time periods
19179 +         when both interrupts and preemption are disabled. The histogram data
19180 +         will be located in the debug file system at
19181 +
19182 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
19183 +
19184  config SCHED_TRACER
19185         bool "Scheduling Latency Tracer"
19186         select GENERIC_TRACER
19187 @@ -251,6 +287,74 @@ config HWLAT_TRACER
19188          file. Every time a latency is greater than tracing_thresh, it will
19189          be recorded into the ring buffer.
19190  
19191 +config WAKEUP_LATENCY_HIST
19192 +       bool "Scheduling Latency Histogram"
19193 +       depends on SCHED_TRACER
19194 +       help
19195 +         This option generates continuously updated histograms (one per cpu)
19196 +         of the scheduling latency of the highest priority task.
19197 +         The histograms are disabled by default. To enable them, write a
19198 +         non-zero number to
19199 +
19200 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
19201 +
19202 +         Two different algorithms are used, one to determine the latency of
19203 +         processes that exclusively use the highest priority of the system and
19204 +         another one to determine the latency of processes that share the
19205 +         highest system priority with other processes. The former is used to
19206 +         improve hardware and system software, the latter to optimize the
19207 +         priority design of a given system. The histogram data will be
19208 +         located in the debug file system at
19209 +
19210 +             /sys/kernel/debug/tracing/latency_hist/wakeup
19211 +
19212 +         and
19213 +
19214 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
19215 +
19216 +         If both Scheduling Latency Histogram and Missed Timer Offsets
19217 +         Histogram are selected, additional histogram data will be collected
19218 +         that contain, in addition to the wakeup latency, the timer latency, in
19219 +         case the wakeup was triggered by an expired timer. These histograms
19220 +         are available in the
19221 +
19222 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
19223 +
19224 +         directory. They reflect the apparent interrupt and scheduling latency
19225 +         and are best suited to determining the worst-case latency of a given
19226 +         system. To enable these histograms, write a non-zero number to
19227 +
19228 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
19229 +
19230 +config MISSED_TIMER_OFFSETS_HIST
19231 +       depends on HIGH_RES_TIMERS
19232 +       select GENERIC_TRACER
19233 +       bool "Missed Timer Offsets Histogram"
19234 +       help
19235 +         Generate a histogram of missed timer offsets in microseconds. The
19236 +         histograms are disabled by default. To enable them, write a non-zero
19237 +         number to
19238 +
19239 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
19240 +
19241 +         The histogram data will be located in the debug file system at
19242 +
19243 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
19244 +
19245 +         If both Scheduling Latency Histogram and Missed Timer Offsets
19246 +         Histogram are selected, additional histogram data will be collected
19247 +         that contain, in addition to the wakeup latency, the timer latency, in
19248 +         case the wakeup was triggered by an expired timer. These histograms
19249 +         are available in the
19250 +
19251 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
19252 +
19253 +         and are best suited to determining the worst-case latency of a given
19254 +         and are best suitable to determine the worst-case latency of a given
19255 +         system. To enable these histograms, write a non-zero number to
19256 +
19257 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
19258 +
19259  config ENABLE_DEFAULT_TRACERS
19260         bool "Trace process context switches and events"
19261         depends on !GENERIC_TRACER
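Every help text above uses the same control interface: write a non-zero number to a file under latency_hist/enable/ and read the resulting per-CPU histogram files from the matching directory. The following illustrative helper enables the preemptirqsoff trigger and lists the irqsoff histogram directory quoted in the INTERRUPT_OFF_HIST entry; it assumes debugfs is mounted at /sys/kernel/debug, the histogram options are built in, and root privileges.

/* Illustrative helper for the debugfs interface described above. */
#include <dirent.h>
#include <stdio.h>

#define HIST_DIR "/sys/kernel/debug/tracing/latency_hist"

int main(void)
{
        FILE *f = fopen(HIST_DIR "/enable/preemptirqsoff", "w");
        struct dirent *de;
        DIR *d;

        if (!f) {
                perror("enable/preemptirqsoff");
                return 1;
        }
        fputs("1\n", f);                /* any non-zero number enables it */
        fclose(f);

        d = opendir(HIST_DIR "/irqsoff");
        if (!d) {
                perror("irqsoff");
                return 1;
        }
        while ((de = readdir(d)) != NULL)
                printf("%s\n", de->d_name);     /* one histogram per CPU */
        closedir(d);
        return 0;
}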
19262 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
19263 index e57980845549..83af000b783c 100644
19264 --- a/kernel/trace/Makefile
19265 +++ b/kernel/trace/Makefile
19266 @@ -38,6 +38,10 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
19267  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
19268  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
19269  obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
19270 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
19271 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
19272 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
19273 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
19274  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
19275  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
19276  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
19277 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
19278 new file mode 100644
19279 index 000000000000..7f6ee70dea41
19280 --- /dev/null
19281 +++ b/kernel/trace/latency_hist.c
19282 @@ -0,0 +1,1178 @@
19283 +/*
19284 + * kernel/trace/latency_hist.c
19285 + *
19286 + * Add support for histograms of preemption-off latency and
19287 + * interrupt-off latency and wakeup latency; it depends on
19288 + * Real-Time Preemption Support.
19289 + *
19290 + *  Copyright (C) 2005 MontaVista Software, Inc.
19291 + *  Yi Yang <yyang@ch.mvista.com>
19292 + *
19293 + *  Converted to work with the new latency tracer.
19294 + *  Copyright (C) 2008 Red Hat, Inc.
19295 + *    Steven Rostedt <srostedt@redhat.com>
19296 + *
19297 + */
19298 +#include <linux/module.h>
19299 +#include <linux/debugfs.h>
19300 +#include <linux/seq_file.h>
19301 +#include <linux/percpu.h>
19302 +#include <linux/kallsyms.h>
19303 +#include <linux/uaccess.h>
19304 +#include <linux/sched.h>
19305 +#include <linux/sched/rt.h>
19306 +#include <linux/slab.h>
19307 +#include <linux/atomic.h>
19308 +#include <asm/div64.h>
19309 +
19310 +#include "trace.h"
19311 +#include <trace/events/sched.h>
19312 +
19313 +#define NSECS_PER_USECS 1000L
19314 +
19315 +#define CREATE_TRACE_POINTS
19316 +#include <trace/events/hist.h>
19317 +
19318 +enum {
19319 +       IRQSOFF_LATENCY = 0,
19320 +       PREEMPTOFF_LATENCY,
19321 +       PREEMPTIRQSOFF_LATENCY,
19322 +       WAKEUP_LATENCY,
19323 +       WAKEUP_LATENCY_SHAREDPRIO,
19324 +       MISSED_TIMER_OFFSETS,
19325 +       TIMERANDWAKEUP_LATENCY,
19326 +       MAX_LATENCY_TYPE,
19327 +};
19328 +
19329 +#define MAX_ENTRY_NUM 10240
19330 +
19331 +struct hist_data {
19332 +       atomic_t hist_mode; /* 0 log, 1 don't log */
19333 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
19334 +       long min_lat;
19335 +       long max_lat;
19336 +       unsigned long long below_hist_bound_samples;
19337 +       unsigned long long above_hist_bound_samples;
19338 +       long long accumulate_lat;
19339 +       unsigned long long total_samples;
19340 +       unsigned long long hist_array[MAX_ENTRY_NUM];
19341 +};
19342 +
19343 +struct enable_data {
19344 +       int latency_type;
19345 +       int enabled;
19346 +};
19347 +
19348 +static char *latency_hist_dir_root = "latency_hist";
19349 +
19350 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19351 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
19352 +static char *irqsoff_hist_dir = "irqsoff";
19353 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
19354 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
19355 +#endif
19356 +
19357 +#ifdef CONFIG_PREEMPT_OFF_HIST
19358 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
19359 +static char *preemptoff_hist_dir = "preemptoff";
19360 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
19361 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
19362 +#endif
19363 +
19364 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
19365 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
19366 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
19367 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
19368 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
19369 +#endif
19370 +
19371 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
19372 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
19373 +static struct enable_data preemptirqsoff_enabled_data = {
19374 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
19375 +       .enabled = 0,
19376 +};
19377 +#endif
19378 +
19379 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19380 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19381 +struct maxlatproc_data {
19382 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
19383 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
19384 +       int pid;
19385 +       int current_pid;
19386 +       int prio;
19387 +       int current_prio;
19388 +       long latency;
19389 +       long timeroffset;
19390 +       cycle_t timestamp;
19391 +};
19392 +#endif
19393 +
19394 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19395 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
19396 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
19397 +static char *wakeup_latency_hist_dir = "wakeup";
19398 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
19399 +static notrace void probe_wakeup_latency_hist_start(void *v,
19400 +       struct task_struct *p);
19401 +static notrace void probe_wakeup_latency_hist_stop(void *v,
19402 +       bool preempt, struct task_struct *prev, struct task_struct *next);
19403 +static notrace void probe_sched_migrate_task(void *,
19404 +       struct task_struct *task, int cpu);
19405 +static struct enable_data wakeup_latency_enabled_data = {
19406 +       .latency_type = WAKEUP_LATENCY,
19407 +       .enabled = 0,
19408 +};
19409 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
19410 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
19411 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
19412 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
19413 +static unsigned long wakeup_pid;
19414 +#endif
19415 +
19416 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19417 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
19418 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
19419 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
19420 +       long long offset, struct task_struct *curr, struct task_struct *task);
19421 +static struct enable_data missed_timer_offsets_enabled_data = {
19422 +       .latency_type = MISSED_TIMER_OFFSETS,
19423 +       .enabled = 0,
19424 +};
19425 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
19426 +static unsigned long missed_timer_offsets_pid;
19427 +#endif
19428 +
19429 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19430 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19431 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
19432 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
19433 +static struct enable_data timerandwakeup_enabled_data = {
19434 +       .latency_type = TIMERANDWAKEUP_LATENCY,
19435 +       .enabled = 0,
19436 +};
19437 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
19438 +#endif
19439 +
19440 +void notrace latency_hist(int latency_type, int cpu, long latency,
19441 +                         long timeroffset, cycle_t stop,
19442 +                         struct task_struct *p)
19443 +{
19444 +       struct hist_data *my_hist;
19445 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19446 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19447 +       struct maxlatproc_data *mp = NULL;
19448 +#endif
19449 +
19450 +       if (!cpu_possible(cpu) || latency_type < 0 ||
19451 +           latency_type >= MAX_LATENCY_TYPE)
19452 +               return;
19453 +
19454 +       switch (latency_type) {
19455 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19456 +       case IRQSOFF_LATENCY:
19457 +               my_hist = &per_cpu(irqsoff_hist, cpu);
19458 +               break;
19459 +#endif
19460 +#ifdef CONFIG_PREEMPT_OFF_HIST
19461 +       case PREEMPTOFF_LATENCY:
19462 +               my_hist = &per_cpu(preemptoff_hist, cpu);
19463 +               break;
19464 +#endif
19465 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
19466 +       case PREEMPTIRQSOFF_LATENCY:
19467 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
19468 +               break;
19469 +#endif
19470 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19471 +       case WAKEUP_LATENCY:
19472 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
19473 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
19474 +               break;
19475 +       case WAKEUP_LATENCY_SHAREDPRIO:
19476 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
19477 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
19478 +               break;
19479 +#endif
19480 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19481 +       case MISSED_TIMER_OFFSETS:
19482 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
19483 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
19484 +               break;
19485 +#endif
19486 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19487 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19488 +       case TIMERANDWAKEUP_LATENCY:
19489 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
19490 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
19491 +               break;
19492 +#endif
19493 +
19494 +       default:
19495 +               return;
19496 +       }
19497 +
19498 +       latency += my_hist->offset;
19499 +
19500 +       if (atomic_read(&my_hist->hist_mode) == 0)
19501 +               return;
19502 +
19503 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
19504 +               if (latency < 0)
19505 +                       my_hist->below_hist_bound_samples++;
19506 +               else
19507 +                       my_hist->above_hist_bound_samples++;
19508 +       } else
19509 +               my_hist->hist_array[latency]++;
19510 +
19511 +       if (unlikely(latency > my_hist->max_lat ||
19512 +           my_hist->min_lat == LONG_MAX)) {
19513 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19514 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19515 +               if (latency_type == WAKEUP_LATENCY ||
19516 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
19517 +                   latency_type == MISSED_TIMER_OFFSETS ||
19518 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
19519 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
19520 +                       strncpy(mp->current_comm, current->comm,
19521 +                           sizeof(mp->current_comm));
19522 +                       mp->pid = task_pid_nr(p);
19523 +                       mp->current_pid = task_pid_nr(current);
19524 +                       mp->prio = p->prio;
19525 +                       mp->current_prio = current->prio;
19526 +                       mp->latency = latency;
19527 +                       mp->timeroffset = timeroffset;
19528 +                       mp->timestamp = stop;
19529 +               }
19530 +#endif
19531 +               my_hist->max_lat = latency;
19532 +       }
19533 +       if (unlikely(latency < my_hist->min_lat))
19534 +               my_hist->min_lat = latency;
19535 +       my_hist->total_samples++;
19536 +       my_hist->accumulate_lat += latency;
19537 +}
19538 +
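latency_hist() above shifts each sample by the per-histogram offset, increments one bucket per microsecond, counts out-of-range samples separately, and keeps min, max and a running sum for the average shown by l_start(). Below is a reduced standalone version of just that accumulation step, useful for experimenting with the bucket and boundary behaviour (single histogram; no per-CPU state or max-latency-process tracking).

/* Reduced accumulation step of latency_hist(): shift by the histogram
 * offset, use one bucket per microsecond, count out-of-range samples
 * separately and keep min/max/sum.
 */
#include <limits.h>
#include <stdio.h>

#define MAX_ENTRY_NUM 10240

struct hist_data {
        long offset;                    /* MAX_ENTRY_NUM/2 gives a bipolar scale */
        long min_lat, max_lat;
        unsigned long long below_hist_bound_samples;
        unsigned long long above_hist_bound_samples;
        long long accumulate_lat;
        unsigned long long total_samples;
        unsigned long long hist_array[MAX_ENTRY_NUM];
};

static void latency_hist(struct hist_data *h, long latency)
{
        latency += h->offset;

        if (latency < 0 || latency >= MAX_ENTRY_NUM) {
                if (latency < 0)
                        h->below_hist_bound_samples++;
                else
                        h->above_hist_bound_samples++;
        } else {
                h->hist_array[latency]++;
        }

        if (latency > h->max_lat || h->min_lat == LONG_MAX)
                h->max_lat = latency;
        if (latency < h->min_lat)
                h->min_lat = latency;
        h->total_samples++;
        h->accumulate_lat += latency;
}

int main(void)
{
        static struct hist_data h = { .min_lat = LONG_MAX, .max_lat = LONG_MIN };
        long samples[] = { 3, 7, 3, 42 };       /* latencies in microseconds */
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                latency_hist(&h, samples[i]);

        printf("samples=%llu min=%ld max=%ld avg=%lld\n",
               h.total_samples, h.min_lat - h.offset, h.max_lat - h.offset,
               h.accumulate_lat / (long long)h.total_samples - h.offset);
        return 0;
}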
19539 +static void *l_start(struct seq_file *m, loff_t *pos)
19540 +{
19541 +       loff_t *index_ptr = NULL;
19542 +       loff_t index = *pos;
19543 +       struct hist_data *my_hist = m->private;
19544 +
19545 +       if (index == 0) {
19546 +               char minstr[32], avgstr[32], maxstr[32];
19547 +
19548 +               atomic_dec(&my_hist->hist_mode);
19549 +
19550 +               if (likely(my_hist->total_samples)) {
19551 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
19552 +                           my_hist->total_samples);
19553 +                       snprintf(minstr, sizeof(minstr), "%ld",
19554 +                           my_hist->min_lat - my_hist->offset);
19555 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
19556 +                           avg - my_hist->offset);
19557 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
19558 +                           my_hist->max_lat - my_hist->offset);
19559 +               } else {
19560 +                       strcpy(minstr, "<undef>");
19561 +                       strcpy(avgstr, minstr);
19562 +                       strcpy(maxstr, minstr);
19563 +               }
19564 +
19565 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
19566 +                          "#Average latency: %s microseconds\n"
19567 +                          "#Maximum latency: %s microseconds\n"
19568 +                          "#Total samples: %llu\n"
19569 +                          "#There are %llu samples lower than %ld"
19570 +                          " microseconds.\n"
19571 +                          "#There are %llu samples greater or equal"
19572 +                          " than %ld microseconds.\n"
19573 +                          "#usecs\t%16s\n",
19574 +                          minstr, avgstr, maxstr,
19575 +                          my_hist->total_samples,
19576 +                          my_hist->below_hist_bound_samples,
19577 +                          -my_hist->offset,
19578 +                          my_hist->above_hist_bound_samples,
19579 +                          MAX_ENTRY_NUM - my_hist->offset,
19580 +                          "samples");
19581 +       }
19582 +       if (index < MAX_ENTRY_NUM) {
19583 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
19584 +               if (index_ptr)
19585 +                       *index_ptr = index;
19586 +       }
19587 +
19588 +       return index_ptr;
19589 +}
19590 +
19591 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
19592 +{
19593 +       loff_t *index_ptr = p;
19594 +       struct hist_data *my_hist = m->private;
19595 +
19596 +       if (++*pos >= MAX_ENTRY_NUM) {
19597 +               atomic_inc(&my_hist->hist_mode);
19598 +               return NULL;
19599 +       }
19600 +       *index_ptr = *pos;
19601 +       return index_ptr;
19602 +}
19603 +
19604 +static void l_stop(struct seq_file *m, void *p)
19605 +{
19606 +       kfree(p);
19607 +}
19608 +
19609 +static int l_show(struct seq_file *m, void *p)
19610 +{
19611 +       int index = *(loff_t *) p;
19612 +       struct hist_data *my_hist = m->private;
19613 +
19614 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
19615 +           my_hist->hist_array[index]);
19616 +       return 0;
19617 +}
19618 +
19619 +static const struct seq_operations latency_hist_seq_op = {
19620 +       .start = l_start,
19621 +       .next  = l_next,
19622 +       .stop  = l_stop,
19623 +       .show  = l_show
19624 +};
19625 +
19626 +static int latency_hist_open(struct inode *inode, struct file *file)
19627 +{
19628 +       int ret;
19629 +
19630 +       ret = seq_open(file, &latency_hist_seq_op);
19631 +       if (!ret) {
19632 +               struct seq_file *seq = file->private_data;
19633 +               seq->private = inode->i_private;
19634 +       }
19635 +       return ret;
19636 +}
19637 +
19638 +static const struct file_operations latency_hist_fops = {
19639 +       .open = latency_hist_open,
19640 +       .read = seq_read,
19641 +       .llseek = seq_lseek,
19642 +       .release = seq_release,
19643 +};
19644 +
19645 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19646 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19647 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
19648 +{
19649 +       mp->comm[0] = mp->current_comm[0] = '\0';
19650 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
19651 +           mp->latency = mp->timeroffset = -1;
19652 +       mp->timestamp = 0;
19653 +}
19654 +#endif
19655 +
19656 +static void hist_reset(struct hist_data *hist)
19657 +{
19658 +       atomic_dec(&hist->hist_mode);
19659 +
19660 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
19661 +       hist->below_hist_bound_samples = 0ULL;
19662 +       hist->above_hist_bound_samples = 0ULL;
19663 +       hist->min_lat = LONG_MAX;
19664 +       hist->max_lat = LONG_MIN;
19665 +       hist->total_samples = 0ULL;
19666 +       hist->accumulate_lat = 0LL;
19667 +
19668 +       atomic_inc(&hist->hist_mode);
19669 +}
19670 +
19671 +static ssize_t
19672 +latency_hist_reset(struct file *file, const char __user *a,
19673 +                  size_t size, loff_t *off)
19674 +{
19675 +       int cpu;
19676 +       struct hist_data *hist = NULL;
19677 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19678 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19679 +       struct maxlatproc_data *mp = NULL;
19680 +#endif
19681 +       off_t latency_type = (off_t) file->private_data;
19682 +
19683 +       for_each_online_cpu(cpu) {
19684 +
19685 +               switch (latency_type) {
19686 +#ifdef CONFIG_PREEMPT_OFF_HIST
19687 +               case PREEMPTOFF_LATENCY:
19688 +                       hist = &per_cpu(preemptoff_hist, cpu);
19689 +                       break;
19690 +#endif
19691 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19692 +               case IRQSOFF_LATENCY:
19693 +                       hist = &per_cpu(irqsoff_hist, cpu);
19694 +                       break;
19695 +#endif
19696 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19697 +               case PREEMPTIRQSOFF_LATENCY:
19698 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
19699 +                       break;
19700 +#endif
19701 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19702 +               case WAKEUP_LATENCY:
19703 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
19704 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
19705 +                       break;
19706 +               case WAKEUP_LATENCY_SHAREDPRIO:
19707 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
19708 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
19709 +                       break;
19710 +#endif
19711 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19712 +               case MISSED_TIMER_OFFSETS:
19713 +                       hist = &per_cpu(missed_timer_offsets, cpu);
19714 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
19715 +                       break;
19716 +#endif
19717 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19718 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19719 +               case TIMERANDWAKEUP_LATENCY:
19720 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
19721 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
19722 +                       break;
19723 +#endif
19724 +               }
19725 +
19726 +               hist_reset(hist);
19727 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19728 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19729 +               if (latency_type == WAKEUP_LATENCY ||
19730 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
19731 +                   latency_type == MISSED_TIMER_OFFSETS ||
19732 +                   latency_type == TIMERANDWAKEUP_LATENCY)
19733 +                       clear_maxlatprocdata(mp);
19734 +#endif
19735 +       }
19736 +
19737 +       return size;
19738 +}
19739 +
19740 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19741 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19742 +static ssize_t
19743 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19744 +{
19745 +       char buf[64];
19746 +       int r;
19747 +       unsigned long *this_pid = file->private_data;
19748 +
19749 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
19750 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19751 +}
19752 +
19753 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
19754 +                     size_t cnt, loff_t *ppos)
19755 +{
19756 +       char buf[64];
19757 +       unsigned long pid;
19758 +       unsigned long *this_pid = file->private_data;
19759 +
19760 +       if (cnt >= sizeof(buf))
19761 +               return -EINVAL;
19762 +
19763 +       if (copy_from_user(&buf, ubuf, cnt))
19764 +               return -EFAULT;
19765 +
19766 +       buf[cnt] = '\0';
19767 +
19768 +       if (kstrtoul(buf, 10, &pid))
19769 +               return -EINVAL;
19770 +
19771 +       *this_pid = pid;
19772 +
19773 +       return cnt;
19774 +}
19775 +#endif
19776 +
19777 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
19778 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19779 +static ssize_t
19780 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19781 +{
19782 +       int r;
19783 +       struct maxlatproc_data *mp = file->private_data;
19784 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
19785 +       unsigned long long t;
19786 +       unsigned long usecs, secs;
19787 +       char *buf;
19788 +
19789 +       if (mp->pid == -1 || mp->current_pid == -1) {
19790 +               buf = "(none)\n";
19791 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
19792 +                   strlen(buf));
19793 +       }
19794 +
19795 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
19796 +       if (buf == NULL)
19797 +               return -ENOMEM;
19798 +
19799 +       t = ns2usecs(mp->timestamp);
19800 +       usecs = do_div(t, USEC_PER_SEC);
19801 +       secs = (unsigned long) t;
19802 +       r = snprintf(buf, strmaxlen,
19803 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
19804 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
19805 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
19806 +           secs, usecs);
19807 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19808 +       kfree(buf);
19809 +       return r;
19810 +}
19811 +#endif
19812 +
19813 +static ssize_t
19814 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
19815 +{
19816 +       char buf[64];
19817 +       struct enable_data *ed = file->private_data;
19818 +       int r;
19819 +
19820 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
19821 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
19822 +}
19823 +
19824 +static ssize_t
19825 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
19826 +{
19827 +       char buf[64];
19828 +       long enable;
19829 +       struct enable_data *ed = file->private_data;
19830 +
19831 +       if (cnt >= sizeof(buf))
19832 +               return -EINVAL;
19833 +
19834 +       if (copy_from_user(&buf, ubuf, cnt))
19835 +               return -EFAULT;
19836 +
19837 +       buf[cnt] = 0;
19838 +
19839 +       if (kstrtoul(buf, 10, &enable))
19840 +               return -EINVAL;
19841 +
19842 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
19843 +               return cnt;
19844 +
19845 +       if (enable) {
19846 +               int ret;
19847 +
19848 +               switch (ed->latency_type) {
19849 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19850 +               case PREEMPTIRQSOFF_LATENCY:
19851 +                       ret = register_trace_preemptirqsoff_hist(
19852 +                           probe_preemptirqsoff_hist, NULL);
19853 +                       if (ret) {
19854 +                               pr_info("wakeup trace: Couldn't assign "
19855 +                                   "probe_preemptirqsoff_hist "
19856 +                                   "to trace_preemptirqsoff_hist\n");
19857 +                               return ret;
19858 +                       }
19859 +                       break;
19860 +#endif
19861 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19862 +               case WAKEUP_LATENCY:
19863 +                       ret = register_trace_sched_wakeup(
19864 +                           probe_wakeup_latency_hist_start, NULL);
19865 +                       if (ret) {
19866 +                               pr_info("wakeup trace: Couldn't assign "
19867 +                                   "probe_wakeup_latency_hist_start "
19868 +                                   "to trace_sched_wakeup\n");
19869 +                               return ret;
19870 +                       }
19871 +                       ret = register_trace_sched_wakeup_new(
19872 +                           probe_wakeup_latency_hist_start, NULL);
19873 +                       if (ret) {
19874 +                               pr_info("wakeup trace: Couldn't assign "
19875 +                                   "probe_wakeup_latency_hist_start "
19876 +                                   "to trace_sched_wakeup_new\n");
19877 +                               unregister_trace_sched_wakeup(
19878 +                                   probe_wakeup_latency_hist_start, NULL);
19879 +                               return ret;
19880 +                       }
19881 +                       ret = register_trace_sched_switch(
19882 +                           probe_wakeup_latency_hist_stop, NULL);
19883 +                       if (ret) {
19884 +                               pr_info("wakeup trace: Couldn't assign "
19885 +                                   "probe_wakeup_latency_hist_stop "
19886 +                                   "to trace_sched_switch\n");
19887 +                               unregister_trace_sched_wakeup(
19888 +                                   probe_wakeup_latency_hist_start, NULL);
19889 +                               unregister_trace_sched_wakeup_new(
19890 +                                   probe_wakeup_latency_hist_start, NULL);
19891 +                               return ret;
19892 +                       }
19893 +                       ret = register_trace_sched_migrate_task(
19894 +                           probe_sched_migrate_task, NULL);
19895 +                       if (ret) {
19896 +                               pr_info("wakeup trace: Couldn't assign "
19897 +                                   "probe_sched_migrate_task "
19898 +                                   "to trace_sched_migrate_task\n");
19899 +                               unregister_trace_sched_wakeup(
19900 +                                   probe_wakeup_latency_hist_start, NULL);
19901 +                               unregister_trace_sched_wakeup_new(
19902 +                                   probe_wakeup_latency_hist_start, NULL);
19903 +                               unregister_trace_sched_switch(
19904 +                                   probe_wakeup_latency_hist_stop, NULL);
19905 +                               return ret;
19906 +                       }
19907 +                       break;
19908 +#endif
19909 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19910 +               case MISSED_TIMER_OFFSETS:
19911 +                       ret = register_trace_hrtimer_interrupt(
19912 +                           probe_hrtimer_interrupt, NULL);
19913 +                       if (ret) {
19914 +                               pr_info("wakeup trace: Couldn't assign "
19915 +                                   "probe_hrtimer_interrupt "
19916 +                                   "to trace_hrtimer_interrupt\n");
19917 +                               return ret;
19918 +                       }
19919 +                       break;
19920 +#endif
19921 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
19922 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
19923 +               case TIMERANDWAKEUP_LATENCY:
19924 +                       if (!wakeup_latency_enabled_data.enabled ||
19925 +                           !missed_timer_offsets_enabled_data.enabled)
19926 +                               return -EINVAL;
19927 +                       break;
19928 +#endif
19929 +               default:
19930 +                       break;
19931 +               }
19932 +       } else {
19933 +               switch (ed->latency_type) {
19934 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
19935 +               case PREEMPTIRQSOFF_LATENCY:
19936 +                       {
19937 +                               int cpu;
19938 +
19939 +                               unregister_trace_preemptirqsoff_hist(
19940 +                                   probe_preemptirqsoff_hist, NULL);
19941 +                               for_each_online_cpu(cpu) {
19942 +#ifdef CONFIG_INTERRUPT_OFF_HIST
19943 +                                       per_cpu(hist_irqsoff_counting,
19944 +                                           cpu) = 0;
19945 +#endif
19946 +#ifdef CONFIG_PREEMPT_OFF_HIST
19947 +                                       per_cpu(hist_preemptoff_counting,
19948 +                                           cpu) = 0;
19949 +#endif
19950 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
19951 +                                       per_cpu(hist_preemptirqsoff_counting,
19952 +                                           cpu) = 0;
19953 +#endif
19954 +                               }
19955 +                       }
19956 +                       break;
19957 +#endif
19958 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19959 +               case WAKEUP_LATENCY:
19960 +                       {
19961 +                               int cpu;
19962 +
19963 +                               unregister_trace_sched_wakeup(
19964 +                                   probe_wakeup_latency_hist_start, NULL);
19965 +                               unregister_trace_sched_wakeup_new(
19966 +                                   probe_wakeup_latency_hist_start, NULL);
19967 +                               unregister_trace_sched_switch(
19968 +                                   probe_wakeup_latency_hist_stop, NULL);
19969 +                               unregister_trace_sched_migrate_task(
19970 +                                   probe_sched_migrate_task, NULL);
19971 +
19972 +                               for_each_online_cpu(cpu) {
19973 +                                       per_cpu(wakeup_task, cpu) = NULL;
19974 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
19975 +                               }
19976 +                       }
19977 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19978 +                       timerandwakeup_enabled_data.enabled = 0;
19979 +#endif
19980 +                       break;
19981 +#endif
19982 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19983 +               case MISSED_TIMER_OFFSETS:
19984 +                       unregister_trace_hrtimer_interrupt(
19985 +                           probe_hrtimer_interrupt, NULL);
19986 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
19987 +                       timerandwakeup_enabled_data.enabled = 0;
19988 +#endif
19989 +                       break;
19990 +#endif
19991 +               default:
19992 +                       break;
19993 +               }
19994 +       }
19995 +       ed->enabled = enable;
19996 +       return cnt;
19997 +}
19998 +
19999 +static const struct file_operations latency_hist_reset_fops = {
20000 +       .open = tracing_open_generic,
20001 +       .write = latency_hist_reset,
20002 +};
20003 +
20004 +static const struct file_operations enable_fops = {
20005 +       .open = tracing_open_generic,
20006 +       .read = show_enable,
20007 +       .write = do_enable,
20008 +};
20009 +
20010 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20011 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20012 +static const struct file_operations pid_fops = {
20013 +       .open = tracing_open_generic,
20014 +       .read = show_pid,
20015 +       .write = do_pid,
20016 +};
20017 +
20018 +static const struct file_operations maxlatproc_fops = {
20019 +       .open = tracing_open_generic,
20020 +       .read = show_maxlatproc,
20021 +};
20022 +#endif
20023 +
20024 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
20025 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
20026 +       int starthist)
20027 +{
20028 +       int cpu = raw_smp_processor_id();
20029 +       int time_set = 0;
20030 +
20031 +       if (starthist) {
20032 +               cycle_t uninitialized_var(start);
20033 +
20034 +               if (!preempt_count() && !irqs_disabled())
20035 +                       return;
20036 +
20037 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20038 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
20039 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
20040 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
20041 +                       start = ftrace_now(cpu);
20042 +                       time_set++;
20043 +                       per_cpu(hist_irqsoff_start, cpu) = start;
20044 +               }
20045 +#endif
20046 +
20047 +#ifdef CONFIG_PREEMPT_OFF_HIST
20048 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
20049 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
20050 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
20051 +                       if (!(time_set++))
20052 +                               start = ftrace_now(cpu);
20053 +                       per_cpu(hist_preemptoff_start, cpu) = start;
20054 +               }
20055 +#endif
20056 +
20057 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
20058 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
20059 +                   per_cpu(hist_preemptoff_counting, cpu) &&
20060 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
20061 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
20062 +                       if (!time_set)
20063 +                               start = ftrace_now(cpu);
20064 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
20065 +               }
20066 +#endif
20067 +       } else {
20068 +               cycle_t uninitialized_var(stop);
20069 +
20070 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20071 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
20072 +                   per_cpu(hist_irqsoff_counting, cpu)) {
20073 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
20074 +
20075 +                       stop = ftrace_now(cpu);
20076 +                       time_set++;
20077 +                       if (start) {
20078 +                               long latency = ((long) (stop - start)) /
20079 +                                   NSECS_PER_USECS;
20080 +
20081 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
20082 +                                   stop, NULL);
20083 +                       }
20084 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
20085 +               }
20086 +#endif
20087 +
20088 +#ifdef CONFIG_PREEMPT_OFF_HIST
20089 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
20090 +                   per_cpu(hist_preemptoff_counting, cpu)) {
20091 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
20092 +
20093 +                       if (!(time_set++))
20094 +                               stop = ftrace_now(cpu);
20095 +                       if (start) {
20096 +                               long latency = ((long) (stop - start)) /
20097 +                                   NSECS_PER_USECS;
20098 +
20099 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
20100 +                                   0, stop, NULL);
20101 +                       }
20102 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
20103 +               }
20104 +#endif
20105 +
20106 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
20107 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
20108 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
20109 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
20110 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
20111 +
20112 +                       if (!time_set)
20113 +                               stop = ftrace_now(cpu);
20114 +                       if (start) {
20115 +                               long latency = ((long) (stop - start)) /
20116 +                                   NSECS_PER_USECS;
20117 +
20118 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
20119 +                                   latency, 0, stop, NULL);
20120 +                       }
20121 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
20122 +               }
20123 +#endif
20124 +       }
20125 +}
20126 +#endif
20127 +
20128 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20129 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
20130 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
20131 +       int cpu)
20132 +{
20133 +       int old_cpu = task_cpu(task);
20134 +
20135 +       if (cpu != old_cpu) {
20136 +               unsigned long flags;
20137 +               struct task_struct *cpu_wakeup_task;
20138 +
20139 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
20140 +
20141 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
20142 +               if (task == cpu_wakeup_task) {
20143 +                       put_task_struct(cpu_wakeup_task);
20144 +                       per_cpu(wakeup_task, old_cpu) = NULL;
20145 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
20146 +                       get_task_struct(cpu_wakeup_task);
20147 +               }
20148 +
20149 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
20150 +       }
20151 +}
20152 +
20153 +static notrace void probe_wakeup_latency_hist_start(void *v,
20154 +       struct task_struct *p)
20155 +{
20156 +       unsigned long flags;
20157 +       struct task_struct *curr = current;
20158 +       int cpu = task_cpu(p);
20159 +       struct task_struct *cpu_wakeup_task;
20160 +
20161 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
20162 +
20163 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
20164 +
20165 +       if (wakeup_pid) {
20166 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
20167 +                   p->prio == curr->prio)
20168 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
20169 +               if (likely(wakeup_pid != task_pid_nr(p)))
20170 +                       goto out;
20171 +       } else {
20172 +               if (likely(!rt_task(p)) ||
20173 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
20174 +                   p->prio > curr->prio)
20175 +                       goto out;
20176 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
20177 +                   p->prio == curr->prio)
20178 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
20179 +       }
20180 +
20181 +       if (cpu_wakeup_task)
20182 +               put_task_struct(cpu_wakeup_task);
20183 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
20184 +       get_task_struct(cpu_wakeup_task);
20185 +       cpu_wakeup_task->preempt_timestamp_hist =
20186 +               ftrace_now(raw_smp_processor_id());
20187 +out:
20188 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
20189 +}
20190 +
20191 +static notrace void probe_wakeup_latency_hist_stop(void *v,
20192 +       bool preempt, struct task_struct *prev, struct task_struct *next)
20193 +{
20194 +       unsigned long flags;
20195 +       int cpu = task_cpu(next);
20196 +       long latency;
20197 +       cycle_t stop;
20198 +       struct task_struct *cpu_wakeup_task;
20199 +
20200 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
20201 +
20202 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
20203 +
20204 +       if (cpu_wakeup_task == NULL)
20205 +               goto out;
20206 +
20207 +       /* Already running? */
20208 +       if (unlikely(current == cpu_wakeup_task))
20209 +               goto out_reset;
20210 +
20211 +       if (next != cpu_wakeup_task) {
20212 +               if (next->prio < cpu_wakeup_task->prio)
20213 +                       goto out_reset;
20214 +
20215 +               if (next->prio == cpu_wakeup_task->prio)
20216 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
20217 +
20218 +               goto out;
20219 +       }
20220 +
20221 +       if (current->prio == cpu_wakeup_task->prio)
20222 +               per_cpu(wakeup_sharedprio, cpu) = 1;
20223 +
20224 +       /*
20225 +        * The task we are waiting for is about to be switched to.
20226 +        * Calculate latency and store it in histogram.
20227 +        */
20228 +       stop = ftrace_now(raw_smp_processor_id());
20229 +
20230 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
20231 +           NSECS_PER_USECS;
20232 +
20233 +       if (per_cpu(wakeup_sharedprio, cpu)) {
20234 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
20235 +                   next);
20236 +               per_cpu(wakeup_sharedprio, cpu) = 0;
20237 +       } else {
20238 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
20239 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20240 +               if (timerandwakeup_enabled_data.enabled) {
20241 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
20242 +                           next->timer_offset + latency, next->timer_offset,
20243 +                           stop, next);
20244 +               }
20245 +#endif
20246 +       }
20247 +
20248 +out_reset:
20249 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20250 +       next->timer_offset = 0;
20251 +#endif
20252 +       put_task_struct(cpu_wakeup_task);
20253 +       per_cpu(wakeup_task, cpu) = NULL;
20254 +out:
20255 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
20256 +}
20257 +#endif
20258 +
20259 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20260 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
20261 +       long long latency_ns, struct task_struct *curr,
20262 +       struct task_struct *task)
20263 +{
20264 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
20265 +           (task->prio < curr->prio ||
20266 +           (task->prio == curr->prio &&
20267 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
20268 +               long latency;
20269 +               cycle_t now;
20270 +
20271 +               if (missed_timer_offsets_pid) {
20272 +                       if (likely(missed_timer_offsets_pid !=
20273 +                           task_pid_nr(task)))
20274 +                               return;
20275 +               }
20276 +
20277 +               now = ftrace_now(cpu);
20278 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
20279 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
20280 +                   task);
20281 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20282 +               task->timer_offset = latency;
20283 +#endif
20284 +       }
20285 +}
20286 +#endif
20287 +
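
probe_hrtimer_interrupt() above only accounts an expiry when latency_ns is non-positive (the timer fired at or past its programmed time) and a runnable RT task of higher priority, or of equal priority but without affinity to this CPU, was waiting for it; the value is then sign-flipped and scaled from nanoseconds to microseconds before being binned. A minimal stand-alone illustration of that conversion, assuming NSECS_PER_USECS is 1000 as the name suggests (the macro itself is defined earlier in this patch, outside this excerpt):

#include <stdio.h>

/*
 * Illustrative only: mirrors the -latency_ns / NSECS_PER_USECS conversion
 * performed by probe_hrtimer_interrupt() above.  NSECS_PER_USECS is an
 * assumption here (taken as 1000, i.e. nanoseconds per microsecond).
 */
#define NSECS_PER_USECS 1000LL

int main(void)
{
	long long latency_ns = -1500000LL;	/* timer expired 1.5 ms late */
	long latency_us = (long)(-latency_ns / NSECS_PER_USECS);

	printf("binned as %ld us\n", latency_us);	/* prints 1500 */
	return 0;
}
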
20288 +static __init int latency_hist_init(void)
20289 +{
20290 +       struct dentry *latency_hist_root = NULL;
20291 +       struct dentry *dentry;
20292 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20293 +       struct dentry *dentry_sharedprio;
20294 +#endif
20295 +       struct dentry *entry;
20296 +       struct dentry *enable_root;
20297 +       int i = 0;
20298 +       struct hist_data *my_hist;
20299 +       char name[64];
20300 +       char *cpufmt = "CPU%d";
20301 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20302 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20303 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
20304 +       struct maxlatproc_data *mp = NULL;
20305 +#endif
20306 +
20307 +       dentry = tracing_init_dentry();
20308 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
20309 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
20310 +
20311 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20312 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
20313 +       for_each_possible_cpu(i) {
20314 +               sprintf(name, cpufmt, i);
20315 +               entry = debugfs_create_file(name, 0444, dentry,
20316 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
20317 +               my_hist = &per_cpu(irqsoff_hist, i);
20318 +               atomic_set(&my_hist->hist_mode, 1);
20319 +               my_hist->min_lat = LONG_MAX;
20320 +       }
20321 +       entry = debugfs_create_file("reset", 0644, dentry,
20322 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
20323 +#endif
20324 +
20325 +#ifdef CONFIG_PREEMPT_OFF_HIST
20326 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
20327 +           latency_hist_root);
20328 +       for_each_possible_cpu(i) {
20329 +               sprintf(name, cpufmt, i);
20330 +               entry = debugfs_create_file(name, 0444, dentry,
20331 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
20332 +               my_hist = &per_cpu(preemptoff_hist, i);
20333 +               atomic_set(&my_hist->hist_mode, 1);
20334 +               my_hist->min_lat = LONG_MAX;
20335 +       }
20336 +       entry = debugfs_create_file("reset", 0644, dentry,
20337 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
20338 +#endif
20339 +
20340 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
20341 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
20342 +           latency_hist_root);
20343 +       for_each_possible_cpu(i) {
20344 +               sprintf(name, cpufmt, i);
20345 +               entry = debugfs_create_file(name, 0444, dentry,
20346 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
20347 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
20348 +               atomic_set(&my_hist->hist_mode, 1);
20349 +               my_hist->min_lat = LONG_MAX;
20350 +       }
20351 +       entry = debugfs_create_file("reset", 0644, dentry,
20352 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
20353 +#endif
20354 +
20355 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
20356 +       entry = debugfs_create_file("preemptirqsoff", 0644,
20357 +           enable_root, (void *)&preemptirqsoff_enabled_data,
20358 +           &enable_fops);
20359 +#endif
20360 +
20361 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20362 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
20363 +           latency_hist_root);
20364 +       dentry_sharedprio = debugfs_create_dir(
20365 +           wakeup_latency_hist_dir_sharedprio, dentry);
20366 +       for_each_possible_cpu(i) {
20367 +               sprintf(name, cpufmt, i);
20368 +
20369 +               entry = debugfs_create_file(name, 0444, dentry,
20370 +                   &per_cpu(wakeup_latency_hist, i),
20371 +                   &latency_hist_fops);
20372 +               my_hist = &per_cpu(wakeup_latency_hist, i);
20373 +               atomic_set(&my_hist->hist_mode, 1);
20374 +               my_hist->min_lat = LONG_MAX;
20375 +
20376 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
20377 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
20378 +                   &latency_hist_fops);
20379 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
20380 +               atomic_set(&my_hist->hist_mode, 1);
20381 +               my_hist->min_lat = LONG_MAX;
20382 +
20383 +               sprintf(name, cpufmt_maxlatproc, i);
20384 +
20385 +               mp = &per_cpu(wakeup_maxlatproc, i);
20386 +               entry = debugfs_create_file(name, 0444, dentry, mp,
20387 +                   &maxlatproc_fops);
20388 +               clear_maxlatprocdata(mp);
20389 +
20390 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
20391 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
20392 +                   &maxlatproc_fops);
20393 +               clear_maxlatprocdata(mp);
20394 +       }
20395 +       entry = debugfs_create_file("pid", 0644, dentry,
20396 +           (void *)&wakeup_pid, &pid_fops);
20397 +       entry = debugfs_create_file("reset", 0644, dentry,
20398 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
20399 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
20400 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
20401 +       entry = debugfs_create_file("wakeup", 0644,
20402 +           enable_root, (void *)&wakeup_latency_enabled_data,
20403 +           &enable_fops);
20404 +#endif
20405 +
20406 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20407 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
20408 +           latency_hist_root);
20409 +       for_each_possible_cpu(i) {
20410 +               sprintf(name, cpufmt, i);
20411 +               entry = debugfs_create_file(name, 0444, dentry,
20412 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
20413 +               my_hist = &per_cpu(missed_timer_offsets, i);
20414 +               atomic_set(&my_hist->hist_mode, 1);
20415 +               my_hist->min_lat = LONG_MAX;
20416 +
20417 +               sprintf(name, cpufmt_maxlatproc, i);
20418 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
20419 +               entry = debugfs_create_file(name, 0444, dentry, mp,
20420 +                   &maxlatproc_fops);
20421 +               clear_maxlatprocdata(mp);
20422 +       }
20423 +       entry = debugfs_create_file("pid", 0644, dentry,
20424 +           (void *)&missed_timer_offsets_pid, &pid_fops);
20425 +       entry = debugfs_create_file("reset", 0644, dentry,
20426 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
20427 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
20428 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
20429 +           &enable_fops);
20430 +#endif
20431 +
20432 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20433 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20434 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
20435 +           latency_hist_root);
20436 +       for_each_possible_cpu(i) {
20437 +               sprintf(name, cpufmt, i);
20438 +               entry = debugfs_create_file(name, 0444, dentry,
20439 +                   &per_cpu(timerandwakeup_latency_hist, i),
20440 +                   &latency_hist_fops);
20441 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
20442 +               atomic_set(&my_hist->hist_mode, 1);
20443 +               my_hist->min_lat = LONG_MAX;
20444 +
20445 +               sprintf(name, cpufmt_maxlatproc, i);
20446 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
20447 +               entry = debugfs_create_file(name, 0444, dentry, mp,
20448 +                   &maxlatproc_fops);
20449 +               clear_maxlatprocdata(mp);
20450 +       }
20451 +       entry = debugfs_create_file("reset", 0644, dentry,
20452 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
20453 +       entry = debugfs_create_file("timerandwakeup", 0644,
20454 +           enable_root, (void *)&timerandwakeup_enabled_data,
20455 +           &enable_fops);
20456 +#endif
20457 +       return 0;
20458 +}
20459 +
20460 +device_initcall(latency_hist_init);
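
latency_hist_init() above assembles a debugfs control surface under the tracing directory: one "CPU%d" file per possible CPU and histogram type, a "reset" file per type, "max_latency-CPU%d" records and "pid" filters for the wakeup and missed-timer histograms, and on/off switches named "preemptirqsoff", "wakeup", "missed_timer_offsets" and "timerandwakeup" under "enable/". A minimal user-space sketch of driving that interface follows; the file names are taken from the code above, while the mount point and the per-histogram directory names (defined earlier in this patch) are assumptions:

/*
 * Illustrative sketch, assuming debugfs is mounted at /sys/kernel/debug
 * and the histogram root is "latency_hist" under the tracing directory.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define HIST "/sys/kernel/debug/tracing/latency_hist"

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, s, strlen(s)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* switch the wakeup histogram on, then clear any stale data */
	if (write_str(HIST "/enable/wakeup", "1") ||
	    write_str(HIST "/wakeup/reset", "1"))
		perror("latency_hist");

	/* ... run the real-time workload, then dump CPU0's bins ... */
	fd = open(HIST "/wakeup/CPU0", O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return 0;
}

Writing a PID into the wakeup "pid" file restricts accounting to that task, and writing 0 removes the filter, since the probes only compare against a non-zero wakeup_pid.
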
20461 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
20462 index 8696ce6bf2f6..277f048a4695 100644
20463 --- a/kernel/trace/trace.c
20464 +++ b/kernel/trace/trace.c
20465 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
20466         struct task_struct *tsk = current;
20467  
20468         entry->preempt_count            = pc & 0xff;
20469 +       entry->preempt_lazy_count       = preempt_lazy_count();
20470         entry->pid                      = (tsk) ? tsk->pid : 0;
20471         entry->flags =
20472  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
20473 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
20474                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
20475                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
20476                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
20477 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
20478 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
20479 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
20480                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
20481 +
20482 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
20483  }
20484  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
20485  
20486 @@ -2892,14 +2896,17 @@ get_total_entries(struct trace_buffer *buf,
20487  
20488  static void print_lat_help_header(struct seq_file *m)
20489  {
20490 -       seq_puts(m, "#                  _------=> CPU#            \n"
20491 -                   "#                 / _-----=> irqs-off        \n"
20492 -                   "#                | / _----=> need-resched    \n"
20493 -                   "#                || / _---=> hardirq/softirq \n"
20494 -                   "#                ||| / _--=> preempt-depth   \n"
20495 -                   "#                |||| /     delay            \n"
20496 -                   "#  cmd     pid   ||||| time  |   caller      \n"
20497 -                   "#     \\   /      |||||  \\    |   /         \n");
20498 +       seq_puts(m, "#                  _--------=> CPU#              \n"
20499 +                   "#                 / _-------=> irqs-off          \n"
20500 +                   "#                | / _------=> need-resched      \n"
20501 +                   "#                || / _-----=> need-resched_lazy \n"
20502 +                   "#                ||| / _----=> hardirq/softirq   \n"
20503 +                   "#                |||| / _---=> preempt-depth     \n"
20504 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
20505 +                   "#                |||||| / _-=> migrate-disable   \n"
20506 +                   "#                ||||||| /     delay             \n"
20507 +                   "# cmd     pid    |||||||| time   |  caller       \n"
20508 +                   "#     \\   /      ||||||||   \\    |  /            \n");
20509  }
20510  
20511  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
20512 @@ -2925,11 +2932,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
20513         print_event_info(buf, m);
20514         seq_puts(m, "#                              _-----=> irqs-off\n"
20515                     "#                             / _----=> need-resched\n"
20516 -                   "#                            | / _---=> hardirq/softirq\n"
20517 -                   "#                            || / _--=> preempt-depth\n"
20518 -                   "#                            ||| /     delay\n"
20519 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
20520 -                   "#              | |       |   ||||       |         |\n");
20521 +                   "#                            |/  _-----=> need-resched_lazy\n"
20522 +                   "#                            || / _---=> hardirq/softirq\n"
20523 +                   "#                            ||| / _--=> preempt-depth\n"
20524 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
20525 +                   "#                            ||||| / _-=> migrate-disable   \n"
20526 +                   "#                            |||||| /    delay\n"
20527 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
20528 +                   "#              | |       |   |||||||      |         |\n");
20529  }
20530  
20531  void
20532 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
20533 index fd24b1f9ac43..852b2c81be25 100644
20534 --- a/kernel/trace/trace.h
20535 +++ b/kernel/trace/trace.h
20536 @@ -124,6 +124,7 @@ struct kretprobe_trace_entry_head {
20537   *  NEED_RESCHED       - reschedule is requested
20538   *  HARDIRQ            - inside an interrupt handler
20539   *  SOFTIRQ            - inside a softirq handler
20540 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
20541   */
20542  enum trace_flag_type {
20543         TRACE_FLAG_IRQS_OFF             = 0x01,
20544 @@ -133,6 +134,7 @@ enum trace_flag_type {
20545         TRACE_FLAG_SOFTIRQ              = 0x10,
20546         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
20547         TRACE_FLAG_NMI                  = 0x40,
20548 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
20549  };
20550  
20551  #define TRACE_BUF_SIZE         1024
20552 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
20553 index 03c0a48c3ac4..0b85d516b491 100644
20554 --- a/kernel/trace/trace_events.c
20555 +++ b/kernel/trace/trace_events.c
20556 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
20557         __common_field(unsigned char, flags);
20558         __common_field(unsigned char, preempt_count);
20559         __common_field(int, pid);
20560 +       __common_field(unsigned short, migrate_disable);
20561 +       __common_field(unsigned short, padding);
20562  
20563         return ret;
20564  }
20565 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
20566 index 03cdff84d026..940bd10b4406 100644
20567 --- a/kernel/trace/trace_irqsoff.c
20568 +++ b/kernel/trace/trace_irqsoff.c
20569 @@ -13,6 +13,7 @@
20570  #include <linux/uaccess.h>
20571  #include <linux/module.h>
20572  #include <linux/ftrace.h>
20573 +#include <trace/events/hist.h>
20574  
20575  #include "trace.h"
20576  
20577 @@ -424,11 +425,13 @@ void start_critical_timings(void)
20578  {
20579         if (preempt_trace() || irq_trace())
20580                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20581 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
20582  }
20583  EXPORT_SYMBOL_GPL(start_critical_timings);
20584  
20585  void stop_critical_timings(void)
20586  {
20587 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
20588         if (preempt_trace() || irq_trace())
20589                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20590  }
20591 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
20592  #ifdef CONFIG_PROVE_LOCKING
20593  void time_hardirqs_on(unsigned long a0, unsigned long a1)
20594  {
20595 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
20596         if (!preempt_trace() && irq_trace())
20597                 stop_critical_timing(a0, a1);
20598  }
20599 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
20600  {
20601         if (!preempt_trace() && irq_trace())
20602                 start_critical_timing(a0, a1);
20603 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
20604  }
20605  
20606  #else /* !CONFIG_PROVE_LOCKING */
20607 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
20608   */
20609  void trace_hardirqs_on(void)
20610  {
20611 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
20612         if (!preempt_trace() && irq_trace())
20613                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20614  }
20615 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
20616  {
20617         if (!preempt_trace() && irq_trace())
20618                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
20619 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
20620  }
20621  EXPORT_SYMBOL(trace_hardirqs_off);
20622  
20623  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
20624  {
20625 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
20626         if (!preempt_trace() && irq_trace())
20627                 stop_critical_timing(CALLER_ADDR0, caller_addr);
20628  }
20629 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
20630  {
20631         if (!preempt_trace() && irq_trace())
20632                 start_critical_timing(CALLER_ADDR0, caller_addr);
20633 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
20634  }
20635  EXPORT_SYMBOL(trace_hardirqs_off_caller);
20636  
20637 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
20638  #ifdef CONFIG_PREEMPT_TRACER
20639  void trace_preempt_on(unsigned long a0, unsigned long a1)
20640  {
20641 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
20642         if (preempt_trace() && !irq_trace())
20643                 stop_critical_timing(a0, a1);
20644  }
20645  
20646  void trace_preempt_off(unsigned long a0, unsigned long a1)
20647  {
20648 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
20649         if (preempt_trace() && !irq_trace())
20650                 start_critical_timing(a0, a1);
20651  }
20652 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
20653 index 3fc20422c166..65a6dde71a7d 100644
20654 --- a/kernel/trace/trace_output.c
20655 +++ b/kernel/trace/trace_output.c
20656 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
20657  {
20658         char hardsoft_irq;
20659         char need_resched;
20660 +       char need_resched_lazy;
20661         char irqs_off;
20662         int hardirq;
20663         int softirq;
20664 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
20665                 break;
20666         }
20667  
20668 +       need_resched_lazy =
20669 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
20670 +
20671         hardsoft_irq =
20672                 (nmi && hardirq)     ? 'Z' :
20673                 nmi                  ? 'z' :
20674 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
20675                 softirq              ? 's' :
20676                                        '.' ;
20677  
20678 -       trace_seq_printf(s, "%c%c%c",
20679 -                        irqs_off, need_resched, hardsoft_irq);
20680 +       trace_seq_printf(s, "%c%c%c%c",
20681 +                        irqs_off, need_resched, need_resched_lazy,
20682 +                        hardsoft_irq);
20683  
20684         if (entry->preempt_count)
20685                 trace_seq_printf(s, "%x", entry->preempt_count);
20686         else
20687                 trace_seq_putc(s, '.');
20688  
20689 +       if (entry->preempt_lazy_count)
20690 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
20691 +       else
20692 +               trace_seq_putc(s, '.');
20693 +
20694 +       if (entry->migrate_disable)
20695 +               trace_seq_printf(s, "%x", entry->migrate_disable);
20696 +       else
20697 +               trace_seq_putc(s, '.');
20698 +
20699         return !trace_seq_has_overflowed(s);
20700  }
20701  
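
Together with the header changes in trace.c above, this extends the latency-format prefix of every trace line to four flag characters (irqs-off, need-resched, an 'L' when a lazy reschedule is requested, hardirq/softirq) followed by three counters, each printed as a hex digit or '.' when zero: preempt depth, the new preempt-lazy depth and the new migrate-disable depth. A small stand-alone sketch of that counter formatting, mirroring the logic above for illustration:

#include <stdio.h>

/*
 * Illustrative only: reproduces the "hex digit or '.'" convention that
 * trace_print_lat_fmt() above uses for the three trailing counters.
 */
static void print_counts(unsigned int preempt, unsigned int preempt_lazy,
			 unsigned int migrate_disable)
{
	if (preempt)
		printf("%x", preempt);
	else
		putchar('.');
	if (preempt_lazy)
		printf("%x", preempt_lazy);
	else
		putchar('.');
	if (migrate_disable)
		printf("%x", migrate_disable);
	else
		putchar('.');
	putchar('\n');
}

int main(void)
{
	print_counts(1, 0, 2);	/* prints "1.2" */
	print_counts(0, 0, 0);	/* prints "..." */
	return 0;
}

A prefix such as "d.L.1.2" would therefore read: interrupts off, no immediate resched, lazy resched pending, no hard/soft irq context, preempt depth 1, no lazy preempt depth, migrate_disable depth 2.
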
20702 diff --git a/kernel/user.c b/kernel/user.c
20703 index b069ccbfb0b0..1a2e88e98b5e 100644
20704 --- a/kernel/user.c
20705 +++ b/kernel/user.c
20706 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
20707         if (!up)
20708                 return;
20709  
20710 -       local_irq_save(flags);
20711 +       local_irq_save_nort(flags);
20712         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
20713                 free_user(up, flags);
20714         else
20715 -               local_irq_restore(flags);
20716 +               local_irq_restore_nort(flags);
20717  }
20718  
20719  struct user_struct *alloc_uid(kuid_t uid)
20720 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
20721 index 6d1020c03d41..70c6a2f79f7e 100644
20722 --- a/kernel/watchdog.c
20723 +++ b/kernel/watchdog.c
20724 @@ -315,6 +315,8 @@ static int is_softlockup(unsigned long touch_ts)
20725  
20726  #ifdef CONFIG_HARDLOCKUP_DETECTOR
20727  
20728 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
20729 +
20730  static struct perf_event_attr wd_hw_attr = {
20731         .type           = PERF_TYPE_HARDWARE,
20732         .config         = PERF_COUNT_HW_CPU_CYCLES,
20733 @@ -348,6 +350,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
20734                 /* only print hardlockups once */
20735                 if (__this_cpu_read(hard_watchdog_warn) == true)
20736                         return;
20737 +               /*
20738 +                * If early-printk is enabled then make sure we do not
20739 +                * lock up in printk() and kill console logging:
20740 +                */
20741 +               printk_kill();
20742 +
20743 +               raw_spin_lock(&watchdog_output_lock);
20744  
20745                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
20746                 print_modules();
20747 @@ -365,6 +374,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
20748                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
20749                         trigger_allbutself_cpu_backtrace();
20750  
20751 +               raw_spin_unlock(&watchdog_output_lock);
20752                 if (hardlockup_panic)
20753                         nmi_panic(regs, "Hard LOCKUP");
20754  
20755 @@ -512,6 +522,7 @@ static void watchdog_enable(unsigned int cpu)
20756         /* kick off the timer for the hardlockup detector */
20757         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
20758         hrtimer->function = watchdog_timer_fn;
20759 +       hrtimer->irqsafe = 1;
20760  
20761         /* Enable the perf event */
20762         watchdog_nmi_enable(cpu);
20763 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
20764 index 479d840db286..24eba6620a45 100644
20765 --- a/kernel/workqueue.c
20766 +++ b/kernel/workqueue.c
20767 @@ -48,6 +48,8 @@
20768  #include <linux/nodemask.h>
20769  #include <linux/moduleparam.h>
20770  #include <linux/uaccess.h>
20771 +#include <linux/locallock.h>
20772 +#include <linux/delay.h>
20773  
20774  #include "workqueue_internal.h"
20775  
20776 @@ -121,11 +123,16 @@ enum {
20777   *    cpu or grabbing pool->lock is enough for read access.  If
20778   *    POOL_DISASSOCIATED is set, it's identical to L.
20779   *
20780 + *    On RT we need the extra protection via rt_lock_idle_list() for
20781 + *    the list manipulations against read access from
20782 + *    wq_worker_sleeping(). All other places are nicely serialized via
20783 + *    pool->lock.
20784 + *
20785   * A: pool->attach_mutex protected.
20786   *
20787   * PL: wq_pool_mutex protected.
20788   *
20789 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
20790 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
20791   *
20792   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
20793   *
20794 @@ -134,7 +141,7 @@ enum {
20795   *
20796   * WQ: wq->mutex protected.
20797   *
20798 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
20799 + * WR: wq->mutex protected for writes.  RCU protected for reads.
20800   *
20801   * MD: wq_mayday_lock protected.
20802   */
20803 @@ -185,7 +192,7 @@ struct worker_pool {
20804         atomic_t                nr_running ____cacheline_aligned_in_smp;
20805  
20806         /*
20807 -        * Destruction of pool is sched-RCU protected to allow dereferences
20808 +        * Destruction of pool is RCU protected to allow dereferences
20809          * from get_work_pool().
20810          */
20811         struct rcu_head         rcu;
20812 @@ -214,7 +221,7 @@ struct pool_workqueue {
20813         /*
20814          * Release of unbound pwq is punted to system_wq.  See put_pwq()
20815          * and pwq_unbound_release_workfn() for details.  pool_workqueue
20816 -        * itself is also sched-RCU protected so that the first pwq can be
20817 +        * itself is also RCU protected so that the first pwq can be
20818          * determined without grabbing wq->mutex.
20819          */
20820         struct work_struct      unbound_release_work;
20821 @@ -348,6 +355,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
20822  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
20823  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
20824  
20825 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
20826 +
20827  static int worker_thread(void *__worker);
20828  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20829  
20830 @@ -355,20 +364,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20831  #include <trace/events/workqueue.h>
20832  
20833  #define assert_rcu_or_pool_mutex()                                     \
20834 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
20835 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
20836                          !lockdep_is_held(&wq_pool_mutex),              \
20837 -                        "sched RCU or wq_pool_mutex should be held")
20838 +                        "RCU or wq_pool_mutex should be held")
20839  
20840  #define assert_rcu_or_wq_mutex(wq)                                     \
20841 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
20842 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
20843                          !lockdep_is_held(&wq->mutex),                  \
20844 -                        "sched RCU or wq->mutex should be held")
20845 +                        "RCU or wq->mutex should be held")
20846  
20847  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
20848 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
20849 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
20850                          !lockdep_is_held(&wq->mutex) &&                \
20851                          !lockdep_is_held(&wq_pool_mutex),              \
20852 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
20853 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
20854  
20855  #define for_each_cpu_worker_pool(pool, cpu)                            \
20856         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
20857 @@ -380,7 +389,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20858   * @pool: iteration cursor
20859   * @pi: integer used for iteration
20860   *
20861 - * This must be called either with wq_pool_mutex held or sched RCU read
20862 + * This must be called either with wq_pool_mutex held or RCU read
20863   * locked.  If the pool needs to be used beyond the locking in effect, the
20864   * caller is responsible for guaranteeing that the pool stays online.
20865   *
20866 @@ -412,7 +421,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20867   * @pwq: iteration cursor
20868   * @wq: the target workqueue
20869   *
20870 - * This must be called either with wq->mutex held or sched RCU read locked.
20871 + * This must be called either with wq->mutex held or RCU read locked.
20872   * If the pwq needs to be used beyond the locking in effect, the caller is
20873   * responsible for guaranteeing that the pwq stays online.
20874   *
20875 @@ -424,6 +433,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
20876                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
20877                 else
20878  
20879 +#ifdef CONFIG_PREEMPT_RT_BASE
20880 +static inline void rt_lock_idle_list(struct worker_pool *pool)
20881 +{
20882 +       preempt_disable();
20883 +}
20884 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
20885 +{
20886 +       preempt_enable();
20887 +}
20888 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
20889 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
20890 +#else
20891 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
20892 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
20893 +static inline void sched_lock_idle_list(struct worker_pool *pool)
20894 +{
20895 +       spin_lock_irq(&pool->lock);
20896 +}
20897 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
20898 +{
20899 +       spin_unlock_irq(&pool->lock);
20900 +}
20901 +#endif
20902 +
20903 +
20904  #ifdef CONFIG_DEBUG_OBJECTS_WORK
20905  
20906  static struct debug_obj_descr work_debug_descr;
20907 @@ -548,7 +582,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
20908   * @wq: the target workqueue
20909   * @node: the node ID
20910   *
20911 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
20912 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
20913   * read locked.
20914   * If the pwq needs to be used beyond the locking in effect, the caller is
20915   * responsible for guaranteeing that the pwq stays online.
20916 @@ -692,8 +726,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
20917   * @work: the work item of interest
20918   *
20919   * Pools are created and destroyed under wq_pool_mutex, and allows read
20920 - * access under sched-RCU read lock.  As such, this function should be
20921 - * called under wq_pool_mutex or with preemption disabled.
20922 + * access under RCU read lock.  As such, this function should be
20923 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
20924   *
20925   * All fields of the returned pool are accessible as long as the above
20926   * mentioned locking is in effect.  If the returned pool needs to be used
20927 @@ -830,50 +864,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
20928   */
20929  static void wake_up_worker(struct worker_pool *pool)
20930  {
20931 -       struct worker *worker = first_idle_worker(pool);
20932 +       struct worker *worker;
20933 +
20934 +       rt_lock_idle_list(pool);
20935 +
20936 +       worker = first_idle_worker(pool);
20937  
20938         if (likely(worker))
20939                 wake_up_process(worker->task);
20940 +
20941 +       rt_unlock_idle_list(pool);
20942  }
20943  
20944  /**
20945 - * wq_worker_waking_up - a worker is waking up
20946 + * wq_worker_running - a worker is running again
20947   * @task: task waking up
20948 - * @cpu: CPU @task is waking up to
20949   *
20950 - * This function is called during try_to_wake_up() when a worker is
20951 - * being awoken.
20952 - *
20953 - * CONTEXT:
20954 - * spin_lock_irq(rq->lock)
20955 + * This function is called when a worker returns from schedule()
20956   */
20957 -void wq_worker_waking_up(struct task_struct *task, int cpu)
20958 +void wq_worker_running(struct task_struct *task)
20959  {
20960         struct worker *worker = kthread_data(task);
20961  
20962 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
20963 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
20964 +       if (!worker->sleeping)
20965 +               return;
20966 +       if (!(worker->flags & WORKER_NOT_RUNNING))
20967                 atomic_inc(&worker->pool->nr_running);
20968 -       }
20969 +       worker->sleeping = 0;
20970  }
20971  
20972  /**
20973   * wq_worker_sleeping - a worker is going to sleep
20974   * @task: task going to sleep
20975   *
20976 - * This function is called during schedule() when a busy worker is
20977 - * going to sleep.  Worker on the same cpu can be woken up by
20978 - * returning pointer to its task.
20979 - *
20980 - * CONTEXT:
20981 - * spin_lock_irq(rq->lock)
20982 - *
20983 - * Return:
20984 - * Worker task on @cpu to wake up, %NULL if none.
20985 + * This function is called from schedule() when a busy worker is
20986 + * going to sleep.
20987   */
20988 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
20989 +void wq_worker_sleeping(struct task_struct *task)
20990  {
20991 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
20992 +       struct worker *worker = kthread_data(task);
20993         struct worker_pool *pool;
20994  
20995         /*
20996 @@ -882,29 +911,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
20997          * checking NOT_RUNNING.
20998          */
20999         if (worker->flags & WORKER_NOT_RUNNING)
21000 -               return NULL;
21001 +               return;
21002  
21003         pool = worker->pool;
21004  
21005 -       /* this can only happen on the local cpu */
21006 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
21007 -               return NULL;
21008 +       if (WARN_ON_ONCE(worker->sleeping))
21009 +               return;
21010 +
21011 +       worker->sleeping = 1;
21012  
21013         /*
21014          * The counterpart of the following dec_and_test, implied mb,
21015          * worklist not empty test sequence is in insert_work().
21016          * Please read comment there.
21017 -        *
21018 -        * NOT_RUNNING is clear.  This means that we're bound to and
21019 -        * running on the local cpu w/ rq lock held and preemption
21020 -        * disabled, which in turn means that none else could be
21021 -        * manipulating idle_list, so dereferencing idle_list without pool
21022 -        * lock is safe.
21023          */
21024         if (atomic_dec_and_test(&pool->nr_running) &&
21025 -           !list_empty(&pool->worklist))
21026 -               to_wakeup = first_idle_worker(pool);
21027 -       return to_wakeup ? to_wakeup->task : NULL;
21028 +           !list_empty(&pool->worklist)) {
21029 +               sched_lock_idle_list(pool);
21030 +               wake_up_worker(pool);
21031 +               sched_unlock_idle_list(pool);
21032 +       }
21033  }
21034  
21035  /**
21036 @@ -1098,12 +1124,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
21037  {
21038         if (pwq) {
21039                 /*
21040 -                * As both pwqs and pools are sched-RCU protected, the
21041 +                * As both pwqs and pools are RCU protected, the
21042                  * following lock operations are safe.
21043                  */
21044 -               spin_lock_irq(&pwq->pool->lock);
21045 +               rcu_read_lock();
21046 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
21047                 put_pwq(pwq);
21048 -               spin_unlock_irq(&pwq->pool->lock);
21049 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
21050 +               rcu_read_unlock();
21051         }
21052  }
21053  
21054 @@ -1207,7 +1235,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
21055         struct worker_pool *pool;
21056         struct pool_workqueue *pwq;
21057  
21058 -       local_irq_save(*flags);
21059 +       local_lock_irqsave(pendingb_lock, *flags);
21060  
21061         /* try to steal the timer if it exists */
21062         if (is_dwork) {
21063 @@ -1226,6 +1254,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
21064         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
21065                 return 0;
21066  
21067 +       rcu_read_lock();
21068         /*
21069          * The queueing is in progress, or it is already queued. Try to
21070          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
21071 @@ -1264,14 +1293,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
21072                 set_work_pool_and_keep_pending(work, pool->id);
21073  
21074                 spin_unlock(&pool->lock);
21075 +               rcu_read_unlock();
21076                 return 1;
21077         }
21078         spin_unlock(&pool->lock);
21079  fail:
21080 -       local_irq_restore(*flags);
21081 +       rcu_read_unlock();
21082 +       local_unlock_irqrestore(pendingb_lock, *flags);
21083         if (work_is_canceling(work))
21084                 return -ENOENT;
21085 -       cpu_relax();
21086 +       cpu_chill();
21087         return -EAGAIN;
21088  }
21089  
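The hunk above converts the bare irq-off section around the PENDING bit into the pendingb_lock local lock, brackets the pool lookup with rcu_read_lock(), and replaces cpu_relax() with cpu_chill() so a task spinning on PENDING sleeps briefly on RT instead of live-locking against a preempted owner. The local-lock primitives come from include/linux/locallock.h, which this patch adds elsewhere; the fallback below is only a rough sketch of their !RT shape, not the patch's exact code:

/*
 * Rough sketch (assumption) of the !PREEMPT_RT_BASE fallback for the
 * local-lock calls used above.  On RT each local lock is a real per-CPU
 * spinlock_t, so these sections stay preemptible there.
 */
#ifndef CONFIG_PREEMPT_RT_BASE
#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
#define local_lock(lvar)                       preempt_disable()
#define local_unlock(lvar)                     preempt_enable()
#define local_lock_irq(lvar)                   local_irq_disable()
#define local_unlock_irq(lvar)                 local_irq_enable()
#define local_lock_irqsave(lvar, flags)        local_irq_save(flags)
#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
#endif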
21090 @@ -1373,7 +1404,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
21091          * queued or lose PENDING.  Grabbing PENDING and queueing should
21092          * happen with IRQ disabled.
21093          */
21094 -       WARN_ON_ONCE(!irqs_disabled());
21095 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
21096  
21097         debug_work_activate(work);
21098  
21099 @@ -1381,6 +1412,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
21100         if (unlikely(wq->flags & __WQ_DRAINING) &&
21101             WARN_ON_ONCE(!is_chained_work(wq)))
21102                 return;
21103 +       rcu_read_lock();
21104  retry:
21105         if (req_cpu == WORK_CPU_UNBOUND)
21106                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
21107 @@ -1437,10 +1469,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
21108         /* pwq determined, queue */
21109         trace_workqueue_queue_work(req_cpu, pwq, work);
21110  
21111 -       if (WARN_ON(!list_empty(&work->entry))) {
21112 -               spin_unlock(&pwq->pool->lock);
21113 -               return;
21114 -       }
21115 +       if (WARN_ON(!list_empty(&work->entry)))
21116 +               goto out;
21117  
21118         pwq->nr_in_flight[pwq->work_color]++;
21119         work_flags = work_color_to_flags(pwq->work_color);
21120 @@ -1458,7 +1488,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
21121  
21122         insert_work(pwq, work, worklist, work_flags);
21123  
21124 +out:
21125         spin_unlock(&pwq->pool->lock);
21126 +       rcu_read_unlock();
21127  }
21128  
21129  /**
21130 @@ -1478,14 +1510,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
21131         bool ret = false;
21132         unsigned long flags;
21133  
21134 -       local_irq_save(flags);
21135 +       local_lock_irqsave(pendingb_lock, flags);
21136  
21137         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
21138                 __queue_work(cpu, wq, work);
21139                 ret = true;
21140         }
21141  
21142 -       local_irq_restore(flags);
21143 +       local_unlock_irqrestore(pendingb_lock, flags);
21144         return ret;
21145  }
21146  EXPORT_SYMBOL(queue_work_on);
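queue_work_on() keeps its external contract: callers are untouched and may still queue from interrupt context, which is why pendingb_lock has to be an irq-safe local lock rather than a plain preempt-disabled region. A minimal caller for reference (every name here is illustrative, not taken from the patch):

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work);      /* illustrative */
static DECLARE_WORK(my_work, my_work_fn);

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
        /* Only WORK_STRUCT_PENDING is touched from this context,
         * before and after the patch. */
        queue_work(system_wq, &my_work);
        return IRQ_HANDLED;
}

static void my_work_fn(struct work_struct *work)
{
        pr_info("deferred work runs in a kworker\n");
}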
21147 @@ -1552,14 +1584,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
21148         unsigned long flags;
21149  
21150         /* read the comment in __queue_work() */
21151 -       local_irq_save(flags);
21152 +       local_lock_irqsave(pendingb_lock, flags);
21153  
21154         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
21155                 __queue_delayed_work(cpu, wq, dwork, delay);
21156                 ret = true;
21157         }
21158  
21159 -       local_irq_restore(flags);
21160 +       local_unlock_irqrestore(pendingb_lock, flags);
21161         return ret;
21162  }
21163  EXPORT_SYMBOL(queue_delayed_work_on);
21164 @@ -1594,7 +1626,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
21165  
21166         if (likely(ret >= 0)) {
21167                 __queue_delayed_work(cpu, wq, dwork, delay);
21168 -               local_irq_restore(flags);
21169 +               local_unlock_irqrestore(pendingb_lock, flags);
21170         }
21171  
21172         /* -ENOENT from try_to_grab_pending() becomes %true */
21173 @@ -1627,7 +1659,9 @@ static void worker_enter_idle(struct worker *worker)
21174         worker->last_active = jiffies;
21175  
21176         /* idle_list is LIFO */
21177 +       rt_lock_idle_list(pool);
21178         list_add(&worker->entry, &pool->idle_list);
21179 +       rt_unlock_idle_list(pool);
21180  
21181         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
21182                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
21183 @@ -1660,7 +1694,9 @@ static void worker_leave_idle(struct worker *worker)
21184                 return;
21185         worker_clr_flags(worker, WORKER_IDLE);
21186         pool->nr_idle--;
21187 +       rt_lock_idle_list(pool);
21188         list_del_init(&worker->entry);
21189 +       rt_unlock_idle_list(pool);
21190  }
21191  
21192  static struct worker *alloc_worker(int node)
21193 @@ -1826,7 +1862,9 @@ static void destroy_worker(struct worker *worker)
21194         pool->nr_workers--;
21195         pool->nr_idle--;
21196  
21197 +       rt_lock_idle_list(pool);
21198         list_del_init(&worker->entry);
21199 +       rt_unlock_idle_list(pool);
21200         worker->flags |= WORKER_DIE;
21201         wake_up_process(worker->task);
21202  }
21203 @@ -2785,14 +2823,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
21204  
21205         might_sleep();
21206  
21207 -       local_irq_disable();
21208 +       rcu_read_lock();
21209         pool = get_work_pool(work);
21210         if (!pool) {
21211 -               local_irq_enable();
21212 +               rcu_read_unlock();
21213                 return false;
21214         }
21215  
21216 -       spin_lock(&pool->lock);
21217 +       spin_lock_irq(&pool->lock);
21218         /* see the comment in try_to_grab_pending() with the same code */
21219         pwq = get_work_pwq(work);
21220         if (pwq) {
21221 @@ -2821,10 +2859,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
21222         else
21223                 lock_map_acquire_read(&pwq->wq->lockdep_map);
21224         lock_map_release(&pwq->wq->lockdep_map);
21225 -
21226 +       rcu_read_unlock();
21227         return true;
21228  already_gone:
21229         spin_unlock_irq(&pool->lock);
21230 +       rcu_read_unlock();
21231         return false;
21232  }
21233  
21234 @@ -2911,7 +2950,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
21235  
21236         /* tell other tasks trying to grab @work to back off */
21237         mark_work_canceling(work);
21238 -       local_irq_restore(flags);
21239 +       local_unlock_irqrestore(pendingb_lock, flags);
21240  
21241         flush_work(work);
21242         clear_work_data(work);
21243 @@ -2966,10 +3005,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
21244   */
21245  bool flush_delayed_work(struct delayed_work *dwork)
21246  {
21247 -       local_irq_disable();
21248 +       local_lock_irq(pendingb_lock);
21249         if (del_timer_sync(&dwork->timer))
21250                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
21251 -       local_irq_enable();
21252 +       local_unlock_irq(pendingb_lock);
21253         return flush_work(&dwork->work);
21254  }
21255  EXPORT_SYMBOL(flush_delayed_work);
21256 @@ -2987,7 +3026,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork)
21257                 return false;
21258  
21259         set_work_pool_and_clear_pending(work, get_work_pool_id(work));
21260 -       local_irq_restore(flags);
21261 +       local_unlock_irqrestore(pendingb_lock, flags);
21262         return ret;
21263  }
21264  
21265 @@ -3245,7 +3284,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
21266   * put_unbound_pool - put a worker_pool
21267   * @pool: worker_pool to put
21268   *
21269 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
21270 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
21271   * safe manner.  get_unbound_pool() calls this function on its failure path
21272   * and this function should be able to release pools which went through,
21273   * successfully or not, init_worker_pool().
21274 @@ -3299,8 +3338,8 @@ static void put_unbound_pool(struct worker_pool *pool)
21275         del_timer_sync(&pool->idle_timer);
21276         del_timer_sync(&pool->mayday_timer);
21277  
21278 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
21279 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
21280 +       /* RCU protected to allow dereferences from get_work_pool() */
21281 +       call_rcu(&pool->rcu, rcu_free_pool);
21282  }
21283  
21284  /**
21285 @@ -3407,14 +3446,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
21286         put_unbound_pool(pool);
21287         mutex_unlock(&wq_pool_mutex);
21288  
21289 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
21290 +       call_rcu(&pwq->rcu, rcu_free_pwq);
21291  
21292         /*
21293          * If we're the last pwq going away, @wq is already dead and no one
21294          * is gonna access it anymore.  Schedule RCU free.
21295          */
21296         if (is_last)
21297 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
21298 +               call_rcu(&wq->rcu, rcu_free_wq);
21299  }
21300  
21301  /**
21302 @@ -4064,7 +4103,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
21303                  * The base ref is never dropped on per-cpu pwqs.  Directly
21304                  * schedule RCU free.
21305                  */
21306 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
21307 +               call_rcu(&wq->rcu, rcu_free_wq);
21308         } else {
21309                 /*
21310                  * We're the sole accessor of @wq at this point.  Directly
21311 @@ -4157,7 +4196,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
21312         struct pool_workqueue *pwq;
21313         bool ret;
21314  
21315 -       rcu_read_lock_sched();
21316 +       rcu_read_lock();
21317 +       preempt_disable();
21318  
21319         if (cpu == WORK_CPU_UNBOUND)
21320                 cpu = smp_processor_id();
21321 @@ -4168,7 +4208,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
21322                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
21323  
21324         ret = !list_empty(&pwq->delayed_works);
21325 -       rcu_read_unlock_sched();
21326 +       preempt_enable();
21327 +       rcu_read_unlock();
21328  
21329         return ret;
21330  }
21331 @@ -4194,15 +4235,15 @@ unsigned int work_busy(struct work_struct *work)
21332         if (work_pending(work))
21333                 ret |= WORK_BUSY_PENDING;
21334  
21335 -       local_irq_save(flags);
21336 +       rcu_read_lock();
21337         pool = get_work_pool(work);
21338         if (pool) {
21339 -               spin_lock(&pool->lock);
21340 +               spin_lock_irqsave(&pool->lock, flags);
21341                 if (find_worker_executing_work(pool, work))
21342                         ret |= WORK_BUSY_RUNNING;
21343 -               spin_unlock(&pool->lock);
21344 +               spin_unlock_irqrestore(&pool->lock, flags);
21345         }
21346 -       local_irq_restore(flags);
21347 +       rcu_read_unlock();
21348  
21349         return ret;
21350  }
21351 @@ -4391,7 +4432,7 @@ void show_workqueue_state(void)
21352         unsigned long flags;
21353         int pi;
21354  
21355 -       rcu_read_lock_sched();
21356 +       rcu_read_lock();
21357  
21358         pr_info("Showing busy workqueues and worker pools:\n");
21359  
21360 @@ -4444,7 +4485,7 @@ void show_workqueue_state(void)
21361                 spin_unlock_irqrestore(&pool->lock, flags);
21362         }
21363  
21364 -       rcu_read_unlock_sched();
21365 +       rcu_read_unlock();
21366  }
21367  
21368  /*
21369 @@ -4782,16 +4823,16 @@ bool freeze_workqueues_busy(void)
21370                  * nr_active is monotonically decreasing.  It's safe
21371                  * to peek without lock.
21372                  */
21373 -               rcu_read_lock_sched();
21374 +               rcu_read_lock();
21375                 for_each_pwq(pwq, wq) {
21376                         WARN_ON_ONCE(pwq->nr_active < 0);
21377                         if (pwq->nr_active) {
21378                                 busy = true;
21379 -                               rcu_read_unlock_sched();
21380 +                               rcu_read_unlock();
21381                                 goto out_unlock;
21382                         }
21383                 }
21384 -               rcu_read_unlock_sched();
21385 +               rcu_read_unlock();
21386         }
21387  out_unlock:
21388         mutex_unlock(&wq_pool_mutex);
21389 @@ -4981,7 +5022,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
21390         const char *delim = "";
21391         int node, written = 0;
21392  
21393 -       rcu_read_lock_sched();
21394 +       get_online_cpus();
21395 +       rcu_read_lock();
21396         for_each_node(node) {
21397                 written += scnprintf(buf + written, PAGE_SIZE - written,
21398                                      "%s%d:%d", delim, node,
21399 @@ -4989,7 +5031,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
21400                 delim = " ";
21401         }
21402         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
21403 -       rcu_read_unlock_sched();
21404 +       rcu_read_unlock();
21405 +       put_online_cpus();
21406  
21407         return written;
21408  }
21409 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
21410 index 8635417c587b..f000c4d6917e 100644
21411 --- a/kernel/workqueue_internal.h
21412 +++ b/kernel/workqueue_internal.h
21413 @@ -43,6 +43,7 @@ struct worker {
21414         unsigned long           last_active;    /* L: last active timestamp */
21415         unsigned int            flags;          /* X: flags */
21416         int                     id;             /* I: worker id */
21417 +       int                     sleeping;       /* None */
21418  
21419         /*
21420          * Opaque string set with work_set_desc().  Printed out with task
21421 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
21422   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
21423   * sched/core.c and workqueue.c.
21424   */
21425 -void wq_worker_waking_up(struct task_struct *task, int cpu);
21426 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
21427 +void wq_worker_running(struct task_struct *task);
21428 +void wq_worker_sleeping(struct task_struct *task);
21429  
21430  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
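With the rename, wq_worker_sleeping() no longer hands a task back to be woken under rq->lock; the two hooks simply bracket schedule(). The call sites live in kernel/sched/core.c and are changed elsewhere in this patch; the pairing below is an illustrative sketch only, and the helper names are an assumption:

#include <linux/sched.h>
#include "../workqueue_internal.h"      /* wq_worker_sleeping()/wq_worker_running() */

/* Illustrative sketch of how the scheduler pairs the two hooks. */
static inline void sched_submit_work(struct task_struct *tsk)
{
        if (tsk->state && (tsk->flags & PF_WQ_WORKER))
                wq_worker_sleeping(tsk);        /* may wake an idle worker */
}

static inline void sched_update_worker(struct task_struct *tsk)
{
        if (tsk->flags & PF_WQ_WORKER)
                wq_worker_running(tsk);         /* clears ->sleeping, fixes nr_running */
}
/* schedule() calls sched_submit_work() before __schedule() and
 * sched_update_worker() after it returns. */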
21431 diff --git a/lib/Kconfig b/lib/Kconfig
21432 index 260a80e313b9..b06becb3f477 100644
21433 --- a/lib/Kconfig
21434 +++ b/lib/Kconfig
21435 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
21436  
21437  config CPUMASK_OFFSTACK
21438         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
21439 +       depends on !PREEMPT_RT_FULL
21440         help
21441           Use dynamic allocation for cpumask_var_t, instead of putting
21442           them on the stack.  This is a bit more expensive, but avoids
21443 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
21444 index 056052dc8e91..d8494e126de8 100644
21445 --- a/lib/debugobjects.c
21446 +++ b/lib/debugobjects.c
21447 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
21448         struct debug_obj *obj;
21449         unsigned long flags;
21450  
21451 -       fill_pool();
21452 +#ifdef CONFIG_PREEMPT_RT_FULL
21453 +       if (preempt_count() == 0 && !irqs_disabled())
21454 +#endif
21455 +               fill_pool();
21456  
21457         db = get_bucket((unsigned long) addr);
21458  
21459 diff --git a/lib/idr.c b/lib/idr.c
21460 index 6098336df267..9decbe914595 100644
21461 --- a/lib/idr.c
21462 +++ b/lib/idr.c
21463 @@ -30,6 +30,7 @@
21464  #include <linux/idr.h>
21465  #include <linux/spinlock.h>
21466  #include <linux/percpu.h>
21467 +#include <linux/locallock.h>
21468  
21469  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
21470  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
21471 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
21472  static DEFINE_PER_CPU(int, idr_preload_cnt);
21473  static DEFINE_SPINLOCK(simple_ida_lock);
21474  
21475 +#ifdef CONFIG_PREEMPT_RT_FULL
21476 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
21477 +
21478 +static inline void idr_preload_lock(void)
21479 +{
21480 +       local_lock(idr_lock);
21481 +}
21482 +
21483 +static inline void idr_preload_unlock(void)
21484 +{
21485 +       local_unlock(idr_lock);
21486 +}
21487 +
21488 +void idr_preload_end(void)
21489 +{
21490 +       idr_preload_unlock();
21491 +}
21492 +EXPORT_SYMBOL(idr_preload_end);
21493 +#else
21494 +static inline void idr_preload_lock(void)
21495 +{
21496 +       preempt_disable();
21497 +}
21498 +
21499 +static inline void idr_preload_unlock(void)
21500 +{
21501 +       preempt_enable();
21502 +}
21503 +#endif
21504 +
21505 +
21506  /* the maximum ID which can be allocated given idr->layers */
21507  static int idr_max(int layers)
21508  {
21509 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
21510          * context.  See idr_preload() for details.
21511          */
21512         if (!in_interrupt()) {
21513 -               preempt_disable();
21514 +               idr_preload_lock();
21515                 new = __this_cpu_read(idr_preload_head);
21516                 if (new) {
21517                         __this_cpu_write(idr_preload_head, new->ary[0]);
21518                         __this_cpu_dec(idr_preload_cnt);
21519                         new->ary[0] = NULL;
21520                 }
21521 -               preempt_enable();
21522 +               idr_preload_unlock();
21523                 if (new)
21524                         return new;
21525         }
21526 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
21527         idr_mark_full(pa, id);
21528  }
21529  
21530 -
21531  /**
21532   * idr_preload - preload for idr_alloc()
21533   * @gfp_mask: allocation mask to use for preloading
21534 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
21535         WARN_ON_ONCE(in_interrupt());
21536         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
21537  
21538 -       preempt_disable();
21539 +       idr_preload_lock();
21540  
21541         /*
21542          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
21543 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
21544         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
21545                 struct idr_layer *new;
21546  
21547 -               preempt_enable();
21548 +               idr_preload_unlock();
21549                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
21550 -               preempt_disable();
21551 +               idr_preload_lock();
21552                 if (!new)
21553                         break;
21554  
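Callers of the preload API are unchanged by the hunk above; the visible difference is that idr_preload_end() becomes a real, exported function on RT (on !RT it presumably stays an inline preempt_enable() in the header). The usual calling pattern, for reference:

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_IDR(my_idr);                      /* illustrative names */
static DEFINE_SPINLOCK(my_idr_lock);

static int my_register(void *ptr)
{
        int id;

        idr_preload(GFP_KERNEL);                /* idr_lock on RT, preempt_disable() otherwise */
        spin_lock(&my_idr_lock);
        id = idr_alloc(&my_idr, ptr, 1, 0, GFP_NOWAIT);
        spin_unlock(&my_idr_lock);
        idr_preload_end();                      /* releases whatever idr_preload() took */

        return id;                              /* >= 1 on success, negative errno on failure */
}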
21555 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
21556 index 1d6565e81030..b23a79761df7 100644
21557 --- a/lib/irq_poll.c
21558 +++ b/lib/irq_poll.c
21559 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
21560         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
21561         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
21562         local_irq_restore(flags);
21563 +       preempt_check_resched_rt();
21564  }
21565  EXPORT_SYMBOL(irq_poll_sched);
21566  
21567 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
21568         local_irq_save(flags);
21569         __irq_poll_complete(iop);
21570         local_irq_restore(flags);
21571 +       preempt_check_resched_rt();
21572  }
21573  EXPORT_SYMBOL(irq_poll_complete);
21574  
21575 @@ -95,6 +97,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
21576                 }
21577  
21578                 local_irq_enable();
21579 +               preempt_check_resched_rt();
21580  
21581                 /* Even though interrupts have been re-enabled, this
21582                  * access is safe because interrupts can only add new
21583 @@ -132,6 +135,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
21584                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
21585  
21586         local_irq_enable();
21587 +       preempt_check_resched_rt();
21588  }
21589  
21590  /**
21591 @@ -195,6 +199,7 @@ static int irq_poll_cpu_dead(unsigned int cpu)
21592                          this_cpu_ptr(&blk_cpu_iopoll));
21593         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
21594         local_irq_enable();
21595 +       preempt_check_resched_rt();
21596  
21597         return 0;
21598  }
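Each place that re-enables interrupts after raising IRQ_POLL_SOFTIRQ now also calls preempt_check_resched_rt(). On RT, softirqs run in thread context, so raising one only marks work pending; the extra call lets the scheduler switch to the softirq thread right away instead of waiting for the next preemption point. A plausible shape for the helper, which is defined elsewhere in this patch (the body here is an assumption):

/* Assumption: no-op on !RT, a resched check on RT. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_check_resched_rt()     preempt_check_resched()
#else
# define preempt_check_resched_rt()     barrier()
#endif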
21599 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
21600 index f3a217ea0388..4611b156ef79 100644
21601 --- a/lib/locking-selftest.c
21602 +++ b/lib/locking-selftest.c
21603 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
21604  #include "locking-selftest-spin-hardirq.h"
21605  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
21606  
21607 +#ifndef CONFIG_PREEMPT_RT_FULL
21608 +
21609  #include "locking-selftest-rlock-hardirq.h"
21610  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
21611  
21612 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
21613  #include "locking-selftest-wlock-softirq.h"
21614  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
21615  
21616 +#endif
21617 +
21618  #undef E1
21619  #undef E2
21620  
21621 +#ifndef CONFIG_PREEMPT_RT_FULL
21622  /*
21623   * Enabling hardirqs with a softirq-safe lock held:
21624   */
21625 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
21626  #undef E1
21627  #undef E2
21628  
21629 +#endif
21630 +
21631  /*
21632   * Enabling irqs with an irq-safe lock held:
21633   */
21634 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
21635  #include "locking-selftest-spin-hardirq.h"
21636  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
21637  
21638 +#ifndef CONFIG_PREEMPT_RT_FULL
21639 +
21640  #include "locking-selftest-rlock-hardirq.h"
21641  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
21642  
21643 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
21644  #include "locking-selftest-wlock-softirq.h"
21645  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
21646  
21647 +#endif
21648 +
21649  #undef E1
21650  #undef E2
21651  
21652 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
21653  #include "locking-selftest-spin-hardirq.h"
21654  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
21655  
21656 +#ifndef CONFIG_PREEMPT_RT_FULL
21657 +
21658  #include "locking-selftest-rlock-hardirq.h"
21659  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
21660  
21661 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
21662  #include "locking-selftest-wlock-softirq.h"
21663  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
21664  
21665 +#endif
21666 +
21667  #undef E1
21668  #undef E2
21669  #undef E3
21670 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
21671  #include "locking-selftest-spin-hardirq.h"
21672  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
21673  
21674 +#ifndef CONFIG_PREEMPT_RT_FULL
21675 +
21676  #include "locking-selftest-rlock-hardirq.h"
21677  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
21678  
21679 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
21680  #include "locking-selftest-wlock-softirq.h"
21681  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
21682  
21683 +#endif
21684 +
21685  #undef E1
21686  #undef E2
21687  #undef E3
21688  
21689 +#ifndef CONFIG_PREEMPT_RT_FULL
21690 +
21691  /*
21692   * read-lock / write-lock irq inversion.
21693   *
21694 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
21695  #undef E2
21696  #undef E3
21697  
21698 +#endif
21699 +
21700 +#ifndef CONFIG_PREEMPT_RT_FULL
21701 +
21702  /*
21703   * read-lock / write-lock recursion that is actually safe.
21704   */
21705 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
21706  #undef E2
21707  #undef E3
21708  
21709 +#endif
21710 +
21711  /*
21712   * read-lock / write-lock recursion that is unsafe.
21713   */
21714 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
21715  
21716         printk("  --------------------------------------------------------------------------\n");
21717  
21718 +#ifndef CONFIG_PREEMPT_RT_FULL
21719         /*
21720          * irq-context testcases:
21721          */
21722 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
21723  
21724         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
21725  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
21726 +#else
21727 +       /* On -rt, we only do hardirq context test for raw spinlock */
21728 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
21729 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
21730 +
21731 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
21732 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
21733 +
21734 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
21735 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
21736 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
21737 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
21738 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
21739 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
21740 +
21741 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
21742 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
21743 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
21744 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
21745 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
21746 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
21747 +#endif
21748  
21749         ww_tests();
21750  
21751 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
21752 index 6d40944960de..822a2c027e72 100644
21753 --- a/lib/percpu_ida.c
21754 +++ b/lib/percpu_ida.c
21755 @@ -26,6 +26,9 @@
21756  #include <linux/string.h>
21757  #include <linux/spinlock.h>
21758  #include <linux/percpu_ida.h>
21759 +#include <linux/locallock.h>
21760 +
21761 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
21762  
21763  struct percpu_ida_cpu {
21764         /*
21765 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21766         unsigned long flags;
21767         int tag;
21768  
21769 -       local_irq_save(flags);
21770 +       local_lock_irqsave(irq_off_lock, flags);
21771         tags = this_cpu_ptr(pool->tag_cpu);
21772  
21773         /* Fastpath */
21774         tag = alloc_local_tag(tags);
21775         if (likely(tag >= 0)) {
21776 -               local_irq_restore(flags);
21777 +               local_unlock_irqrestore(irq_off_lock, flags);
21778                 return tag;
21779         }
21780  
21781 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21782  
21783                 if (!tags->nr_free)
21784                         alloc_global_tags(pool, tags);
21785 +
21786                 if (!tags->nr_free)
21787                         steal_tags(pool, tags);
21788  
21789 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21790                 }
21791  
21792                 spin_unlock(&pool->lock);
21793 -               local_irq_restore(flags);
21794 +               local_unlock_irqrestore(irq_off_lock, flags);
21795  
21796                 if (tag >= 0 || state == TASK_RUNNING)
21797                         break;
21798 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
21799  
21800                 schedule();
21801  
21802 -               local_irq_save(flags);
21803 +               local_lock_irqsave(irq_off_lock, flags);
21804                 tags = this_cpu_ptr(pool->tag_cpu);
21805         }
21806         if (state != TASK_RUNNING)
21807 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
21808  
21809         BUG_ON(tag >= pool->nr_tags);
21810  
21811 -       local_irq_save(flags);
21812 +       local_lock_irqsave(irq_off_lock, flags);
21813         tags = this_cpu_ptr(pool->tag_cpu);
21814  
21815         spin_lock(&tags->lock);
21816 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
21817                 spin_unlock(&pool->lock);
21818         }
21819  
21820 -       local_irq_restore(flags);
21821 +       local_unlock_irqrestore(irq_off_lock, flags);
21822  }
21823  EXPORT_SYMBOL_GPL(percpu_ida_free);
21824  
21825 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
21826         struct percpu_ida_cpu *remote;
21827         unsigned cpu, i, err = 0;
21828  
21829 -       local_irq_save(flags);
21830 +       local_lock_irqsave(irq_off_lock, flags);
21831         for_each_possible_cpu(cpu) {
21832                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
21833                 spin_lock(&remote->lock);
21834 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
21835         }
21836         spin_unlock(&pool->lock);
21837  out:
21838 -       local_irq_restore(flags);
21839 +       local_unlock_irqrestore(irq_off_lock, flags);
21840         return err;
21841  }
21842  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
21843 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
21844 index 8e6d552c40dd..741da5a77fd5 100644
21845 --- a/lib/radix-tree.c
21846 +++ b/lib/radix-tree.c
21847 @@ -36,7 +36,7 @@
21848  #include <linux/bitops.h>
21849  #include <linux/rcupdate.h>
21850  #include <linux/preempt.h>             /* in_interrupt() */
21851 -
21852 +#include <linux/locallock.h>
21853  
21854  /* Number of nodes in fully populated tree of given height */
21855  static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
21856 @@ -68,6 +68,7 @@ struct radix_tree_preload {
21857         struct radix_tree_node *nodes;
21858  };
21859  static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
21860 +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock);
21861  
21862  static inline void *node_to_entry(void *ptr)
21863  {
21864 @@ -290,13 +291,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
21865                  * succeed in getting a node here (and never reach
21866                  * kmem_cache_alloc)
21867                  */
21868 -               rtp = this_cpu_ptr(&radix_tree_preloads);
21869 +               rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
21870                 if (rtp->nr) {
21871                         ret = rtp->nodes;
21872                         rtp->nodes = ret->private_data;
21873                         ret->private_data = NULL;
21874                         rtp->nr--;
21875                 }
21876 +               put_locked_var(radix_tree_preloads_lock, radix_tree_preloads);
21877                 /*
21878                  * Update the allocation stack trace as this is more useful
21879                  * for debugging.
21880 @@ -357,14 +359,14 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
21881          */
21882         gfp_mask &= ~__GFP_ACCOUNT;
21883  
21884 -       preempt_disable();
21885 +       local_lock(radix_tree_preloads_lock);
21886         rtp = this_cpu_ptr(&radix_tree_preloads);
21887         while (rtp->nr < nr) {
21888 -               preempt_enable();
21889 +               local_unlock(radix_tree_preloads_lock);
21890                 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
21891                 if (node == NULL)
21892                         goto out;
21893 -               preempt_disable();
21894 +               local_lock(radix_tree_preloads_lock);
21895                 rtp = this_cpu_ptr(&radix_tree_preloads);
21896                 if (rtp->nr < nr) {
21897                         node->private_data = rtp->nodes;
21898 @@ -406,7 +408,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
21899         if (gfpflags_allow_blocking(gfp_mask))
21900                 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
21901         /* Preloading doesn't help anything with this gfp mask, skip it */
21902 -       preempt_disable();
21903 +       local_lock(radix_tree_preloads_lock);
21904         return 0;
21905  }
21906  EXPORT_SYMBOL(radix_tree_maybe_preload);
21907 @@ -422,7 +424,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
21908  
21909         /* Preloading doesn't help anything with this gfp mask, skip it */
21910         if (!gfpflags_allow_blocking(gfp_mask)) {
21911 -               preempt_disable();
21912 +               local_lock(radix_tree_preloads_lock);
21913                 return 0;
21914         }
21915  
21916 @@ -456,6 +458,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
21917         return __radix_tree_preload(gfp_mask, nr_nodes);
21918  }
21919  
21920 +void radix_tree_preload_end(void)
21921 +{
21922 +       local_unlock(radix_tree_preloads_lock);
21923 +}
21924 +EXPORT_SYMBOL(radix_tree_preload_end);
21925 +
21926  /*
21927   * The maximum index which can be stored in a radix tree
21928   */
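This mirrors the idr change above: the preempt-disabled preload window becomes a radix_tree_preloads_lock section, and radix_tree_preload_end() moves out of line so it can drop that lock. Callers keep the documented shape:

#include <linux/gfp.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(my_tree, GFP_ATOMIC);         /* illustrative */
static DEFINE_SPINLOCK(my_tree_lock);

static int my_insert(unsigned long index, void *item)
{
        int err;

        err = radix_tree_preload(GFP_KERNEL);   /* 0 on success, preload lock held */
        if (err)
                return err;                     /* -ENOMEM: lock is not held */

        spin_lock(&my_tree_lock);
        err = radix_tree_insert(&my_tree, index, item);
        spin_unlock(&my_tree_lock);

        radix_tree_preload_end();               /* now local_unlock() instead of preempt_enable() */
        return err;
}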
21929 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
21930 index 004fc70fc56a..ccc46992a517 100644
21931 --- a/lib/scatterlist.c
21932 +++ b/lib/scatterlist.c
21933 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
21934                         flush_kernel_dcache_page(miter->page);
21935  
21936                 if (miter->__flags & SG_MITER_ATOMIC) {
21937 -                       WARN_ON_ONCE(preemptible());
21938 +                       WARN_ON_ONCE(!pagefault_disabled());
21939                         kunmap_atomic(miter->addr);
21940                 } else
21941                         kunmap(miter->page);
21942 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
21943         if (!sg_miter_skip(&miter, skip))
21944                 return false;
21945  
21946 -       local_irq_save(flags);
21947 +       local_irq_save_nort(flags);
21948  
21949         while (sg_miter_next(&miter) && offset < buflen) {
21950                 unsigned int len;
21951 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
21952  
21953         sg_miter_stop(&miter);
21954  
21955 -       local_irq_restore(flags);
21956 +       local_irq_restore_nort(flags);
21957         return offset;
21958  }
21959  EXPORT_SYMBOL(sg_copy_buffer);
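The _nort variants and the switch to !pagefault_disabled() express that on RT this section is already serialised by other means and only !RT needs real interrupt disabling. A plausible mapping for the helpers, which are added elsewhere in this patch (bodies are an assumption):

/* Assumption: "_nort" toggles interrupts only when PREEMPT_RT_FULL is off. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)     do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)  do { (void)(flags); } while (0)
#else
# define local_irq_save_nort(flags)     local_irq_save(flags)
# define local_irq_restore_nort(flags)  local_irq_restore(flags)
#endif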
21960 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
21961 index 1afec32de6f2..11fa431046a8 100644
21962 --- a/lib/smp_processor_id.c
21963 +++ b/lib/smp_processor_id.c
21964 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
21965         if (!printk_ratelimit())
21966                 goto out_enable;
21967  
21968 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
21969 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
21970 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
21971 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
21972 +               current->comm, current->pid);
21973  
21974         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
21975         dump_stack();
21976 diff --git a/localversion-rt b/localversion-rt
21977 new file mode 100644
21978 index 000000000000..ad3da1bcab7e
21979 --- /dev/null
21980 +++ b/localversion-rt
21981 @@ -0,0 +1 @@
21982 +-rt4
21983 diff --git a/mm/Kconfig b/mm/Kconfig
21984 index 86e3e0e74d20..77e5862a1ed2 100644
21985 --- a/mm/Kconfig
21986 +++ b/mm/Kconfig
21987 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
21988  
21989  config TRANSPARENT_HUGEPAGE
21990         bool "Transparent Hugepage Support"
21991 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
21992 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
21993         select COMPACTION
21994         select RADIX_TREE_MULTIORDER
21995         help
21996 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
21997 index 8fde443f36d7..d7a863b0ec20 100644
21998 --- a/mm/backing-dev.c
21999 +++ b/mm/backing-dev.c
22000 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
22001  {
22002         unsigned long flags;
22003  
22004 -       local_irq_save(flags);
22005 +       local_irq_save_nort(flags);
22006         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
22007 -               local_irq_restore(flags);
22008 +               local_irq_restore_nort(flags);
22009                 return;
22010         }
22011  
22012 diff --git a/mm/compaction.c b/mm/compaction.c
22013 index 70e6bec46dc2..6678ed58b7c6 100644
22014 --- a/mm/compaction.c
22015 +++ b/mm/compaction.c
22016 @@ -1593,10 +1593,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
22017                                 block_start_pfn(cc->migrate_pfn, cc->order);
22018  
22019                         if (cc->last_migrated_pfn < current_block_start) {
22020 -                               cpu = get_cpu();
22021 +                               cpu = get_cpu_light();
22022 +                               local_lock_irq(swapvec_lock);
22023                                 lru_add_drain_cpu(cpu);
22024 +                               local_unlock_irq(swapvec_lock);
22025                                 drain_local_pages(zone);
22026 -                               put_cpu();
22027 +                               put_cpu_light();
22028                                 /* No more flushing until we migrate again */
22029                                 cc->last_migrated_pfn = 0;
22030                         }
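get_cpu_light()/put_cpu_light() plus the swapvec_lock around lru_add_drain_cpu() keep this block bound to one CPU without disabling preemption on RT. The _light helpers are introduced elsewhere in this patch; the definition below is a plausible sketch, not the patch's exact code:

/* Assumption: pin the task to the CPU via migrate_disable() on RT. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()        ({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()        migrate_enable()
#else
# define get_cpu_light()        get_cpu()
# define put_cpu_light()        put_cpu()
#endif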
22031 diff --git a/mm/filemap.c b/mm/filemap.c
22032 index 779801092ef1..554e1b4d0fc5 100644
22033 --- a/mm/filemap.c
22034 +++ b/mm/filemap.c
22035 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
22036                  * node->private_list is protected by
22037                  * mapping->tree_lock.
22038                  */
22039 -               if (!list_empty(&node->private_list))
22040 -                       list_lru_del(&workingset_shadow_nodes,
22041 +               if (!list_empty(&node->private_list)) {
22042 +                       local_lock(workingset_shadow_lock);
22043 +                       list_lru_del(&__workingset_shadow_nodes,
22044                                      &node->private_list);
22045 +                       local_unlock(workingset_shadow_lock);
22046 +               }
22047         }
22048         return 0;
22049  }
22050 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
22051                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
22052                                 list_empty(&node->private_list)) {
22053                         node->private_data = mapping;
22054 -                       list_lru_add(&workingset_shadow_nodes,
22055 -                                       &node->private_list);
22056 +                       local_lock(workingset_shadow_lock);
22057 +                       list_lru_add(&__workingset_shadow_nodes,
22058 +                                    &node->private_list);
22059 +                       local_unlock(workingset_shadow_lock);
22060                 }
22061         }
22062  
22063 diff --git a/mm/highmem.c b/mm/highmem.c
22064 index 50b4ca6787f0..77518a3b35a1 100644
22065 --- a/mm/highmem.c
22066 +++ b/mm/highmem.c
22067 @@ -29,10 +29,11 @@
22068  #include <linux/kgdb.h>
22069  #include <asm/tlbflush.h>
22070  
22071 -
22072 +#ifndef CONFIG_PREEMPT_RT_FULL
22073  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
22074  DEFINE_PER_CPU(int, __kmap_atomic_idx);
22075  #endif
22076 +#endif
22077  
22078  /*
22079   * Virtual_count is not a pure "count".
22080 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
22081  unsigned long totalhigh_pages __read_mostly;
22082  EXPORT_SYMBOL(totalhigh_pages);
22083  
22084 -
22085 +#ifndef CONFIG_PREEMPT_RT_FULL
22086  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
22087 +#endif
22088  
22089  unsigned int nr_free_highpages (void)
22090  {
22091 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
22092 index d536a9daa511..70ac8827ee8c 100644
22093 --- a/mm/memcontrol.c
22094 +++ b/mm/memcontrol.c
22095 @@ -67,6 +67,7 @@
22096  #include <net/sock.h>
22097  #include <net/ip.h>
22098  #include "slab.h"
22099 +#include <linux/locallock.h>
22100  
22101  #include <asm/uaccess.h>
22102  
22103 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
22104  #define do_swap_account                0
22105  #endif
22106  
22107 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
22108 +
22109  /* Whether legacy memory+swap accounting is active */
22110  static bool do_memsw_account(void)
22111  {
22112 @@ -1692,6 +1695,7 @@ struct memcg_stock_pcp {
22113  #define FLUSHING_CACHED_CHARGE 0
22114  };
22115  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
22116 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
22117  static DEFINE_MUTEX(percpu_charge_mutex);
22118  
22119  /**
22120 @@ -1714,7 +1718,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
22121         if (nr_pages > CHARGE_BATCH)
22122                 return ret;
22123  
22124 -       local_irq_save(flags);
22125 +       local_lock_irqsave(memcg_stock_ll, flags);
22126  
22127         stock = this_cpu_ptr(&memcg_stock);
22128         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
22129 @@ -1722,7 +1726,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
22130                 ret = true;
22131         }
22132  
22133 -       local_irq_restore(flags);
22134 +       local_unlock_irqrestore(memcg_stock_ll, flags);
22135  
22136         return ret;
22137  }
22138 @@ -1749,13 +1753,13 @@ static void drain_local_stock(struct work_struct *dummy)
22139         struct memcg_stock_pcp *stock;
22140         unsigned long flags;
22141  
22142 -       local_irq_save(flags);
22143 +       local_lock_irqsave(memcg_stock_ll, flags);
22144  
22145         stock = this_cpu_ptr(&memcg_stock);
22146         drain_stock(stock);
22147         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
22148  
22149 -       local_irq_restore(flags);
22150 +       local_unlock_irqrestore(memcg_stock_ll, flags);
22151  }
22152  
22153  /*
22154 @@ -1767,7 +1771,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
22155         struct memcg_stock_pcp *stock;
22156         unsigned long flags;
22157  
22158 -       local_irq_save(flags);
22159 +       local_lock_irqsave(memcg_stock_ll, flags);
22160  
22161         stock = this_cpu_ptr(&memcg_stock);
22162         if (stock->cached != memcg) { /* reset if necessary */
22163 @@ -1776,7 +1780,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
22164         }
22165         stock->nr_pages += nr_pages;
22166  
22167 -       local_irq_restore(flags);
22168 +       local_unlock_irqrestore(memcg_stock_ll, flags);
22169  }
22170  
22171  /*
22172 @@ -1792,7 +1796,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
22173                 return;
22174         /* Notify other cpus that system-wide "drain" is running */
22175         get_online_cpus();
22176 -       curcpu = get_cpu();
22177 +       curcpu = get_cpu_light();
22178         for_each_online_cpu(cpu) {
22179                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
22180                 struct mem_cgroup *memcg;
22181 @@ -1809,7 +1813,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
22182                                 schedule_work_on(cpu, &stock->work);
22183                 }
22184         }
22185 -       put_cpu();
22186 +       put_cpu_light();
22187         put_online_cpus();
22188         mutex_unlock(&percpu_charge_mutex);
22189  }
22190 @@ -4548,12 +4552,12 @@ static int mem_cgroup_move_account(struct page *page,
22191  
22192         ret = 0;
22193  
22194 -       local_irq_disable();
22195 +       local_lock_irq(event_lock);
22196         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
22197         memcg_check_events(to, page);
22198         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
22199         memcg_check_events(from, page);
22200 -       local_irq_enable();
22201 +       local_unlock_irq(event_lock);
22202  out_unlock:
22203         unlock_page(page);
22204  out:
22205 @@ -5428,10 +5432,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
22206  
22207         commit_charge(page, memcg, lrucare);
22208  
22209 -       local_irq_disable();
22210 +       local_lock_irq(event_lock);
22211         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
22212         memcg_check_events(memcg, page);
22213 -       local_irq_enable();
22214 +       local_unlock_irq(event_lock);
22215  
22216         if (do_memsw_account() && PageSwapCache(page)) {
22217                 swp_entry_t entry = { .val = page_private(page) };
22218 @@ -5487,14 +5491,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
22219                 memcg_oom_recover(memcg);
22220         }
22221  
22222 -       local_irq_save(flags);
22223 +       local_lock_irqsave(event_lock, flags);
22224         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
22225         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
22226         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
22227         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
22228         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
22229         memcg_check_events(memcg, dummy_page);
22230 -       local_irq_restore(flags);
22231 +       local_unlock_irqrestore(event_lock, flags);
22232  
22233         if (!mem_cgroup_is_root(memcg))
22234                 css_put_many(&memcg->css, nr_pages);
22235 @@ -5649,10 +5653,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
22236  
22237         commit_charge(newpage, memcg, false);
22238  
22239 -       local_irq_save(flags);
22240 +       local_lock_irqsave(event_lock, flags);
22241         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
22242         memcg_check_events(memcg, newpage);
22243 -       local_irq_restore(flags);
22244 +       local_unlock_irqrestore(event_lock, flags);
22245  }
22246  
22247  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
22248 @@ -5832,6 +5836,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
22249  {
22250         struct mem_cgroup *memcg, *swap_memcg;
22251         unsigned short oldid;
22252 +       unsigned long flags;
22253  
22254         VM_BUG_ON_PAGE(PageLRU(page), page);
22255         VM_BUG_ON_PAGE(page_count(page), page);
22256 @@ -5872,12 +5877,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
22257          * important here to have the interrupts disabled because it is the
22258          * only synchronisation we have for udpating the per-CPU variables.
22259          */
22260 +       local_lock_irqsave(event_lock, flags);
22261 +#ifndef CONFIG_PREEMPT_RT_BASE
22262         VM_BUG_ON(!irqs_disabled());
22263 +#endif
22264         mem_cgroup_charge_statistics(memcg, page, false, -1);
22265         memcg_check_events(memcg, page);
22266  
22267         if (!mem_cgroup_is_root(memcg))
22268                 css_put(&memcg->css);
22269 +       local_unlock_irqrestore(event_lock, flags);
22270  }
22271  
22272  /*
22273 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
22274 index 6f4d27c5bb32..5cd25c745a8f 100644
22275 --- a/mm/mmu_context.c
22276 +++ b/mm/mmu_context.c
22277 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
22278         struct task_struct *tsk = current;
22279  
22280         task_lock(tsk);
22281 +       preempt_disable_rt();
22282         active_mm = tsk->active_mm;
22283         if (active_mm != mm) {
22284                 atomic_inc(&mm->mm_count);
22285 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
22286         }
22287         tsk->mm = mm;
22288         switch_mm(active_mm, mm, tsk);
22289 +       preempt_enable_rt();
22290         task_unlock(tsk);
22291  #ifdef finish_arch_post_lock_switch
22292         finish_arch_post_lock_switch();
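preempt_disable_rt()/preempt_enable_rt() bracket the active_mm/switch_mm() sequence only where RT needs it; on !RT they are meant to compile away. A plausible shape (defined elsewhere in this patch; an assumption here):

/* Assumption: RT-only preemption bracket used around switch_mm() above. */
#ifdef CONFIG_PREEMPT_RT_BASE
# define preempt_disable_rt()   preempt_disable()
# define preempt_enable_rt()    preempt_enable()
#else
# define preempt_disable_rt()   barrier()
# define preempt_enable_rt()    barrier()
#endif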
22293 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
22294 index 34ada718ef47..21f0dc3fe2aa 100644
22295 --- a/mm/page_alloc.c
22296 +++ b/mm/page_alloc.c
22297 @@ -61,6 +61,7 @@
22298  #include <linux/page_ext.h>
22299  #include <linux/hugetlb.h>
22300  #include <linux/sched/rt.h>
22301 +#include <linux/locallock.h>
22302  #include <linux/page_owner.h>
22303  #include <linux/kthread.h>
22304  #include <linux/memcontrol.h>
22305 @@ -281,6 +282,18 @@ EXPORT_SYMBOL(nr_node_ids);
22306  EXPORT_SYMBOL(nr_online_nodes);
22307  #endif
22308  
22309 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
22310 +
22311 +#ifdef CONFIG_PREEMPT_RT_BASE
22312 +# define cpu_lock_irqsave(cpu, flags)          \
22313 +       local_lock_irqsave_on(pa_lock, flags, cpu)
22314 +# define cpu_unlock_irqrestore(cpu, flags)     \
22315 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
22316 +#else
22317 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
22318 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
22319 +#endif
22320 +
22321  int page_group_by_mobility_disabled __read_mostly;
22322  
22323  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
22324 @@ -1072,7 +1085,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
22325  #endif /* CONFIG_DEBUG_VM */
22326  
22327  /*
22328 - * Frees a number of pages from the PCP lists
22329 + * Frees a number of pages which have been collected from the pcp lists.
22330   * Assumes all pages on list are in same zone, and of same order.
22331   * count is the number of pages to free.
22332   *
22333 @@ -1083,19 +1096,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
22334   * pinned" detection logic.
22335   */
22336  static void free_pcppages_bulk(struct zone *zone, int count,
22337 -                                       struct per_cpu_pages *pcp)
22338 +                              struct list_head *list)
22339  {
22340 -       int migratetype = 0;
22341 -       int batch_free = 0;
22342         unsigned long nr_scanned;
22343         bool isolated_pageblocks;
22344 +       unsigned long flags;
22345 +
22346 +       spin_lock_irqsave(&zone->lock, flags);
22347  
22348 -       spin_lock(&zone->lock);
22349         isolated_pageblocks = has_isolate_pageblock(zone);
22350         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
22351         if (nr_scanned)
22352                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
22353  
22354 +       while (!list_empty(list)) {
22355 +               struct page *page;
22356 +               int mt; /* migratetype of the to-be-freed page */
22357 +
22358 +               page = list_first_entry(list, struct page, lru);
22359 +               /* must delete as __free_one_page list manipulates */
22360 +               list_del(&page->lru);
22361 +
22362 +               mt = get_pcppage_migratetype(page);
22363 +               /* MIGRATE_ISOLATE page should not go to pcplists */
22364 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
22365 +               /* Pageblock could have been isolated meanwhile */
22366 +               if (unlikely(isolated_pageblocks))
22367 +                       mt = get_pageblock_migratetype(page);
22368 +
22369 +               if (bulkfree_pcp_prepare(page))
22370 +                       continue;
22371 +
22372 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
22373 +               trace_mm_page_pcpu_drain(page, 0, mt);
22374 +               count--;
22375 +       }
22376 +       WARN_ON(count != 0);
22377 +       spin_unlock_irqrestore(&zone->lock, flags);
22378 +}
22379 +
22380 +/*
22381 + * Moves a number of pages from the PCP lists to a private list, which
22382 + * is then freed outside of the locked region.
22383 + *
22384 + * Assumes all pages on list are in same zone, and of same order.
22385 + * count is the number of pages to free.
22386 + */
22387 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
22388 +                             struct list_head *dst)
22389 +{
22390 +       int migratetype = 0;
22391 +       int batch_free = 0;
22392 +
22393         while (count) {
22394                 struct page *page;
22395                 struct list_head *list;
22396 @@ -1111,7 +1163,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
22397                         batch_free++;
22398                         if (++migratetype == MIGRATE_PCPTYPES)
22399                                 migratetype = 0;
22400 -                       list = &pcp->lists[migratetype];
22401 +                       list = &src->lists[migratetype];
22402                 } while (list_empty(list));
22403  
22404                 /* This is the only non-empty list. Free them all. */
22405 @@ -1119,27 +1171,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
22406                         batch_free = count;
22407  
22408                 do {
22409 -                       int mt; /* migratetype of the to-be-freed page */
22410 -
22411                         page = list_last_entry(list, struct page, lru);
22412 -                       /* must delete as __free_one_page list manipulates */
22413                         list_del(&page->lru);
22414  
22415 -                       mt = get_pcppage_migratetype(page);
22416 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
22417 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
22418 -                       /* Pageblock could have been isolated meanwhile */
22419 -                       if (unlikely(isolated_pageblocks))
22420 -                               mt = get_pageblock_migratetype(page);
22421 -
22422 -                       if (bulkfree_pcp_prepare(page))
22423 -                               continue;
22424 -
22425 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
22426 -                       trace_mm_page_pcpu_drain(page, 0, mt);
22427 +                       list_add(&page->lru, dst);
22428                 } while (--count && --batch_free && !list_empty(list));
22429         }
22430 -       spin_unlock(&zone->lock);
22431  }
22432  
22433  static void free_one_page(struct zone *zone,
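
The hunks above split the old free_pcppages_bulk() in two: isolate_pcp_pages()
only detaches pages from the per-CPU lists onto a caller-private list, and the
remaining free_pcppages_bulk() hands them back to the buddy allocator under
zone->lock (now taken with irqsave), so neither lock is held across the other
step. The following is a rough userspace sketch of that detach-under-one-lock,
free-under-another shape; it uses pthreads, and names such as cache_lock,
backend_lock, isolate_cached() and free_bulk() are invented for the example,
not taken from the kernel.

/* Illustrative userspace model only, not kernel code.  cc -pthread two_step_free.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

/* Stand-in for a per-CPU pcp list and the lock that guards it. */
static struct node *cache;
static int cache_count;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for zone->lock on the buddy side. */
static pthread_mutex_t backend_lock = PTHREAD_MUTEX_INITIALIZER;

/* Step 1: detach up to count nodes onto a caller-private list. */
static struct node *isolate_cached(int count)
{
        struct node *dst = NULL;

        pthread_mutex_lock(&cache_lock);
        while (count-- && cache) {
                struct node *n = cache;

                cache = n->next;
                cache_count--;
                n->next = dst;
                dst = n;
        }
        pthread_mutex_unlock(&cache_lock);
        return dst;
}

/* Step 2: release the private list under the other lock. */
static void free_bulk(struct node *list)
{
        pthread_mutex_lock(&backend_lock);
        while (list) {
                struct node *n = list;

                list = n->next;
                free(n);
        }
        pthread_mutex_unlock(&backend_lock);
}

int main(void)
{
        for (int i = 0; i < 8; i++) {
                struct node *n = malloc(sizeof(*n));

                if (!n)
                        break;
                n->next = cache;
                cache = n;
                cache_count++;
        }
        free_bulk(isolate_cached(5));
        printf("left in cache: %d\n", cache_count);
        free_bulk(isolate_cached(cache_count));
        return 0;
}
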
22434 @@ -1148,7 +1185,9 @@ static void free_one_page(struct zone *zone,
22435                                 int migratetype)
22436  {
22437         unsigned long nr_scanned;
22438 -       spin_lock(&zone->lock);
22439 +       unsigned long flags;
22440 +
22441 +       spin_lock_irqsave(&zone->lock, flags);
22442         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
22443         if (nr_scanned)
22444                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
22445 @@ -1158,7 +1197,7 @@ static void free_one_page(struct zone *zone,
22446                 migratetype = get_pfnblock_migratetype(page, pfn);
22447         }
22448         __free_one_page(page, pfn, zone, order, migratetype);
22449 -       spin_unlock(&zone->lock);
22450 +       spin_unlock_irqrestore(&zone->lock, flags);
22451  }
22452  
22453  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
22454 @@ -1244,10 +1283,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
22455                 return;
22456  
22457         migratetype = get_pfnblock_migratetype(page, pfn);
22458 -       local_irq_save(flags);
22459 +       local_lock_irqsave(pa_lock, flags);
22460         __count_vm_events(PGFREE, 1 << order);
22461         free_one_page(page_zone(page), page, pfn, order, migratetype);
22462 -       local_irq_restore(flags);
22463 +       local_unlock_irqrestore(pa_lock, flags);
22464  }
22465  
22466  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
22467 @@ -2246,16 +2285,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
22468  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
22469  {
22470         unsigned long flags;
22471 +       LIST_HEAD(dst);
22472         int to_drain, batch;
22473  
22474 -       local_irq_save(flags);
22475 +       local_lock_irqsave(pa_lock, flags);
22476         batch = READ_ONCE(pcp->batch);
22477         to_drain = min(pcp->count, batch);
22478         if (to_drain > 0) {
22479 -               free_pcppages_bulk(zone, to_drain, pcp);
22480 +               isolate_pcp_pages(to_drain, pcp, &dst);
22481                 pcp->count -= to_drain;
22482         }
22483 -       local_irq_restore(flags);
22484 +       local_unlock_irqrestore(pa_lock, flags);
22485 +       free_pcppages_bulk(zone, to_drain, &dst);
22486  }
22487  #endif
22488  
22489 @@ -2271,16 +2312,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
22490         unsigned long flags;
22491         struct per_cpu_pageset *pset;
22492         struct per_cpu_pages *pcp;
22493 +       LIST_HEAD(dst);
22494 +       int count;
22495  
22496 -       local_irq_save(flags);
22497 +       cpu_lock_irqsave(cpu, flags);
22498         pset = per_cpu_ptr(zone->pageset, cpu);
22499  
22500         pcp = &pset->pcp;
22501 -       if (pcp->count) {
22502 -               free_pcppages_bulk(zone, pcp->count, pcp);
22503 +       count = pcp->count;
22504 +       if (count) {
22505 +               isolate_pcp_pages(count, pcp, &dst);
22506                 pcp->count = 0;
22507         }
22508 -       local_irq_restore(flags);
22509 +       cpu_unlock_irqrestore(cpu, flags);
22510 +       if (count)
22511 +               free_pcppages_bulk(zone, count, &dst);
22512  }
22513  
22514  /*
22515 @@ -2366,8 +2412,17 @@ void drain_all_pages(struct zone *zone)
22516                 else
22517                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
22518         }
22519 +#ifndef CONFIG_PREEMPT_RT_BASE
22520         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
22521                                                                 zone, 1);
22522 +#else
22523 +       for_each_cpu(cpu, &cpus_with_pcps) {
22524 +               if (zone)
22525 +                       drain_pages_zone(cpu, zone);
22526 +               else
22527 +                       drain_pages(cpu);
22528 +       }
22529 +#endif
22530  }
22531  
22532  #ifdef CONFIG_HIBERNATION
22533 @@ -2427,7 +2482,7 @@ void free_hot_cold_page(struct page *page, bool cold)
22534  
22535         migratetype = get_pfnblock_migratetype(page, pfn);
22536         set_pcppage_migratetype(page, migratetype);
22537 -       local_irq_save(flags);
22538 +       local_lock_irqsave(pa_lock, flags);
22539         __count_vm_event(PGFREE);
22540  
22541         /*
22542 @@ -2453,12 +2508,17 @@ void free_hot_cold_page(struct page *page, bool cold)
22543         pcp->count++;
22544         if (pcp->count >= pcp->high) {
22545                 unsigned long batch = READ_ONCE(pcp->batch);
22546 -               free_pcppages_bulk(zone, batch, pcp);
22547 +               LIST_HEAD(dst);
22548 +
22549 +               isolate_pcp_pages(batch, pcp, &dst);
22550                 pcp->count -= batch;
22551 +               local_unlock_irqrestore(pa_lock, flags);
22552 +               free_pcppages_bulk(zone, batch, &dst);
22553 +               return;
22554         }
22555  
22556  out:
22557 -       local_irq_restore(flags);
22558 +       local_unlock_irqrestore(pa_lock, flags);
22559  }
22560  
22561  /*
22562 @@ -2600,7 +2660,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
22563                 struct per_cpu_pages *pcp;
22564                 struct list_head *list;
22565  
22566 -               local_irq_save(flags);
22567 +               local_lock_irqsave(pa_lock, flags);
22568                 do {
22569                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
22570                         list = &pcp->lists[migratetype];
22571 @@ -2627,7 +2687,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
22572                  * allocate greater than order-1 page units with __GFP_NOFAIL.
22573                  */
22574                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
22575 -               spin_lock_irqsave(&zone->lock, flags);
22576 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
22577  
22578                 do {
22579                         page = NULL;
22580 @@ -2639,22 +2699,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
22581                         if (!page)
22582                                 page = __rmqueue(zone, order, migratetype);
22583                 } while (page && check_new_pages(page, order));
22584 -               spin_unlock(&zone->lock);
22585 -               if (!page)
22586 +               if (!page) {
22587 +                       spin_unlock(&zone->lock);
22588                         goto failed;
22589 +               }
22590                 __mod_zone_freepage_state(zone, -(1 << order),
22591                                           get_pcppage_migratetype(page));
22592 +               spin_unlock(&zone->lock);
22593         }
22594  
22595         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
22596         zone_statistics(preferred_zone, zone, gfp_flags);
22597 -       local_irq_restore(flags);
22598 +       local_unlock_irqrestore(pa_lock, flags);
22599  
22600         VM_BUG_ON_PAGE(bad_range(zone, page), page);
22601         return page;
22602  
22603  failed:
22604 -       local_irq_restore(flags);
22605 +       local_unlock_irqrestore(pa_lock, flags);
22606         return NULL;
22607  }
22608  
22609 @@ -6505,7 +6567,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
22610         int cpu = (unsigned long)hcpu;
22611  
22612         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
22613 +               local_lock_irq_on(swapvec_lock, cpu);
22614                 lru_add_drain_cpu(cpu);
22615 +               local_unlock_irq_on(swapvec_lock, cpu);
22616                 drain_pages(cpu);
22617  
22618                 /*
22619 @@ -6531,6 +6595,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
22620  void __init page_alloc_init(void)
22621  {
22622         hotcpu_notifier(page_alloc_cpu_notify, 0);
22623 +       local_irq_lock_init(pa_lock);
22624  }
22625  
22626  /*
22627 @@ -7359,7 +7424,7 @@ void zone_pcp_reset(struct zone *zone)
22628         struct per_cpu_pageset *pset;
22629  
22630         /* avoid races with drain_pages()  */
22631 -       local_irq_save(flags);
22632 +       local_lock_irqsave(pa_lock, flags);
22633         if (zone->pageset != &boot_pageset) {
22634                 for_each_online_cpu(cpu) {
22635                         pset = per_cpu_ptr(zone->pageset, cpu);
22636 @@ -7368,7 +7433,7 @@ void zone_pcp_reset(struct zone *zone)
22637                 free_percpu(zone->pageset);
22638                 zone->pageset = &boot_pageset;
22639         }
22640 -       local_irq_restore(flags);
22641 +       local_unlock_irqrestore(pa_lock, flags);
22642  }
22643  
22644  #ifdef CONFIG_MEMORY_HOTREMOVE
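
Throughout this file the patch also replaces local_irq_save()/restore() around
the per-CPU page lists with the pa_lock local lock (initialised in
page_alloc_init()), and drain_all_pages() gains a CONFIG_PREEMPT_RT_BASE branch
that walks cpus_with_pcps and drains each CPU directly instead of firing
on_each_cpu_mask(). The sketch below models only that last idea in userspace:
per-worker queues that each carry their own lock, so one thread can drain all
of them by taking each lock in turn. struct queue, drain_one() and drain_all()
are invented names; build with -pthread.

/* Illustrative userspace model only, not kernel code.  cc -pthread drain_all.c */
#include <pthread.h>
#include <stdio.h>

#define NQUEUES 4

/* One queue per worker, each with its own lock; a stand-in for the per-CPU
 * pcp lists guarded by pa_lock. */
struct queue {
        pthread_mutex_t lock;
        int count;              /* cached items waiting to be flushed */
};

static struct queue queues[NQUEUES];

static void drain_one(struct queue *q)
{
        pthread_mutex_lock(&q->lock);
        q->count = 0;           /* hand everything back to the "zone" */
        pthread_mutex_unlock(&q->lock);
}

/* Drain every queue from the calling thread by taking each queue's lock in
 * turn, instead of interrupting every worker and asking it to drain its own
 * queue; this mirrors the for_each_cpu() loop in the RT branch above. */
static void drain_all(void)
{
        for (int i = 0; i < NQUEUES; i++)
                drain_one(&queues[i]);
}

int main(void)
{
        for (int i = 0; i < NQUEUES; i++) {
                pthread_mutex_init(&queues[i].lock, NULL);
                queues[i].count = 10 + i;
        }
        drain_all();
        for (int i = 0; i < NQUEUES; i++)
                printf("queue %d: %d left\n", i, queues[i].count);
        return 0;
}
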
22645 diff --git a/mm/slab.h b/mm/slab.h
22646 index bc05fdc3edce..610cf61634f0 100644
22647 --- a/mm/slab.h
22648 +++ b/mm/slab.h
22649 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
22650   * The slab lists for all objects.
22651   */
22652  struct kmem_cache_node {
22653 +#ifdef CONFIG_SLUB
22654 +       raw_spinlock_t list_lock;
22655 +#else
22656         spinlock_t list_lock;
22657 +#endif
22658  
22659  #ifdef CONFIG_SLAB
22660         struct list_head slabs_partial; /* partial list first, better asm code */
22661 diff --git a/mm/slub.c b/mm/slub.c
22662 index 2b3e740609e9..1732f9c5d31f 100644
22663 --- a/mm/slub.c
22664 +++ b/mm/slub.c
22665 @@ -1141,7 +1141,7 @@ static noinline int free_debug_processing(
22666         unsigned long uninitialized_var(flags);
22667         int ret = 0;
22668  
22669 -       spin_lock_irqsave(&n->list_lock, flags);
22670 +       raw_spin_lock_irqsave(&n->list_lock, flags);
22671         slab_lock(page);
22672  
22673         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
22674 @@ -1176,7 +1176,7 @@ static noinline int free_debug_processing(
22675                          bulk_cnt, cnt);
22676  
22677         slab_unlock(page);
22678 -       spin_unlock_irqrestore(&n->list_lock, flags);
22679 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22680         if (!ret)
22681                 slab_fix(s, "Object at 0x%p not freed", object);
22682         return ret;
22683 @@ -1304,6 +1304,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
22684  
22685  #endif /* CONFIG_SLUB_DEBUG */
22686  
22687 +struct slub_free_list {
22688 +       raw_spinlock_t          lock;
22689 +       struct list_head        list;
22690 +};
22691 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
22692 +
22693  /*
22694   * Hooks for other subsystems that check memory allocations. In a typical
22695   * production configuration these hooks all should produce no code at all.
22696 @@ -1523,10 +1529,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
22697         void *start, *p;
22698         int idx, order;
22699         bool shuffle;
22700 +       bool enableirqs = false;
22701  
22702         flags &= gfp_allowed_mask;
22703  
22704         if (gfpflags_allow_blocking(flags))
22705 +               enableirqs = true;
22706 +#ifdef CONFIG_PREEMPT_RT_FULL
22707 +       if (system_state == SYSTEM_RUNNING)
22708 +               enableirqs = true;
22709 +#endif
22710 +       if (enableirqs)
22711                 local_irq_enable();
22712  
22713         flags |= s->allocflags;
22714 @@ -1601,7 +1614,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
22715         page->frozen = 1;
22716  
22717  out:
22718 -       if (gfpflags_allow_blocking(flags))
22719 +       if (enableirqs)
22720                 local_irq_disable();
22721         if (!page)
22722                 return NULL;
22723 @@ -1660,6 +1673,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
22724         __free_pages(page, order);
22725  }
22726  
22727 +static void free_delayed(struct list_head *h)
22728 +{
22729 +       while(!list_empty(h)) {
22730 +               struct page *page = list_first_entry(h, struct page, lru);
22731 +
22732 +               list_del(&page->lru);
22733 +               __free_slab(page->slab_cache, page);
22734 +       }
22735 +}
22736 +
22737  #define need_reserve_slab_rcu                                          \
22738         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
22739  
22740 @@ -1691,6 +1714,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
22741                 }
22742  
22743                 call_rcu(head, rcu_free_slab);
22744 +       } else if (irqs_disabled()) {
22745 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
22746 +
22747 +               raw_spin_lock(&f->lock);
22748 +               list_add(&page->lru, &f->list);
22749 +               raw_spin_unlock(&f->lock);
22750         } else
22751                 __free_slab(s, page);
22752  }
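
With the hunk above, free_slab() parks pages on the per-CPU slub_free_list when
interrupts are disabled and free_delayed() later walks that list and calls
__free_slab() from a context where freeing is allowed. Below is a small
userspace model of the same defer-then-flush idea, assuming pthreads; the
cannot_free_now flag, release() and flush_deferred() are invented stand-ins for
the irqs_disabled() test and the raw-spinlock-protected per-CPU list.

/* Illustrative userspace model only, not kernel code.  cc -pthread defer_free.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj { struct obj *next; };

/* Deferred-free list, a stand-in for the per-CPU slub_free_list. */
static struct obj *deferred;
static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pretend flag for "we are in a context where freeing is not allowed",
 * playing the role of irqs_disabled(). */
static bool cannot_free_now;

static void release(struct obj *o)
{
        if (!o)
                return;
        if (cannot_free_now) {
                /* Park the object; somebody will flush it later. */
                pthread_mutex_lock(&deferred_lock);
                o->next = deferred;
                deferred = o;
                pthread_mutex_unlock(&deferred_lock);
        } else {
                free(o);
        }
}

/* Splice the whole list out under the lock, free it outside the lock. */
static void flush_deferred(void)
{
        struct obj *list;

        pthread_mutex_lock(&deferred_lock);
        list = deferred;
        deferred = NULL;
        pthread_mutex_unlock(&deferred_lock);

        while (list) {
                struct obj *o = list;

                list = o->next;
                free(o);
        }
}

int main(void)
{
        cannot_free_now = true;
        for (int i = 0; i < 3; i++)
                release(malloc(sizeof(struct obj)));
        cannot_free_now = false;        /* back in a safe context */
        flush_deferred();
        puts("deferred objects flushed");
        return 0;
}
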
22753 @@ -1798,7 +1827,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
22754         if (!n || !n->nr_partial)
22755                 return NULL;
22756  
22757 -       spin_lock(&n->list_lock);
22758 +       raw_spin_lock(&n->list_lock);
22759         list_for_each_entry_safe(page, page2, &n->partial, lru) {
22760                 void *t;
22761  
22762 @@ -1823,7 +1852,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
22763                         break;
22764  
22765         }
22766 -       spin_unlock(&n->list_lock);
22767 +       raw_spin_unlock(&n->list_lock);
22768         return object;
22769  }
22770  
22771 @@ -2069,7 +2098,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
22772                          * that acquire_slab() will see a slab page that
22773                          * is frozen
22774                          */
22775 -                       spin_lock(&n->list_lock);
22776 +                       raw_spin_lock(&n->list_lock);
22777                 }
22778         } else {
22779                 m = M_FULL;
22780 @@ -2080,7 +2109,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
22781                          * slabs from diagnostic functions will not see
22782                          * any frozen slabs.
22783                          */
22784 -                       spin_lock(&n->list_lock);
22785 +                       raw_spin_lock(&n->list_lock);
22786                 }
22787         }
22788  
22789 @@ -2115,7 +2144,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
22790                 goto redo;
22791  
22792         if (lock)
22793 -               spin_unlock(&n->list_lock);
22794 +               raw_spin_unlock(&n->list_lock);
22795  
22796         if (m == M_FREE) {
22797                 stat(s, DEACTIVATE_EMPTY);
22798 @@ -2147,10 +2176,10 @@ static void unfreeze_partials(struct kmem_cache *s,
22799                 n2 = get_node(s, page_to_nid(page));
22800                 if (n != n2) {
22801                         if (n)
22802 -                               spin_unlock(&n->list_lock);
22803 +                               raw_spin_unlock(&n->list_lock);
22804  
22805                         n = n2;
22806 -                       spin_lock(&n->list_lock);
22807 +                       raw_spin_lock(&n->list_lock);
22808                 }
22809  
22810                 do {
22811 @@ -2179,7 +2208,7 @@ static void unfreeze_partials(struct kmem_cache *s,
22812         }
22813  
22814         if (n)
22815 -               spin_unlock(&n->list_lock);
22816 +               raw_spin_unlock(&n->list_lock);
22817  
22818         while (discard_page) {
22819                 page = discard_page;
22820 @@ -2218,14 +2247,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
22821                         pobjects = oldpage->pobjects;
22822                         pages = oldpage->pages;
22823                         if (drain && pobjects > s->cpu_partial) {
22824 +                               struct slub_free_list *f;
22825                                 unsigned long flags;
22826 +                               LIST_HEAD(tofree);
22827                                 /*
22828                                  * partial array is full. Move the existing
22829                                  * set to the per node partial list.
22830                                  */
22831                                 local_irq_save(flags);
22832                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
22833 +                               f = this_cpu_ptr(&slub_free_list);
22834 +                               raw_spin_lock(&f->lock);
22835 +                               list_splice_init(&f->list, &tofree);
22836 +                               raw_spin_unlock(&f->lock);
22837                                 local_irq_restore(flags);
22838 +                               free_delayed(&tofree);
22839                                 oldpage = NULL;
22840                                 pobjects = 0;
22841                                 pages = 0;
22842 @@ -2297,7 +2333,22 @@ static bool has_cpu_slab(int cpu, void *info)
22843  
22844  static void flush_all(struct kmem_cache *s)
22845  {
22846 +       LIST_HEAD(tofree);
22847 +       int cpu;
22848 +
22849         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
22850 +       for_each_online_cpu(cpu) {
22851 +               struct slub_free_list *f;
22852 +
22853 +               if (!has_cpu_slab(cpu, s))
22854 +                       continue;
22855 +
22856 +               f = &per_cpu(slub_free_list, cpu);
22857 +               raw_spin_lock_irq(&f->lock);
22858 +               list_splice_init(&f->list, &tofree);
22859 +               raw_spin_unlock_irq(&f->lock);
22860 +               free_delayed(&tofree);
22861 +       }
22862  }
22863  
22864  /*
22865 @@ -2352,10 +2403,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
22866         unsigned long x = 0;
22867         struct page *page;
22868  
22869 -       spin_lock_irqsave(&n->list_lock, flags);
22870 +       raw_spin_lock_irqsave(&n->list_lock, flags);
22871         list_for_each_entry(page, &n->partial, lru)
22872                 x += get_count(page);
22873 -       spin_unlock_irqrestore(&n->list_lock, flags);
22874 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22875         return x;
22876  }
22877  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
22878 @@ -2493,8 +2544,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
22879   * already disabled (which is the case for bulk allocation).
22880   */
22881  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22882 -                         unsigned long addr, struct kmem_cache_cpu *c)
22883 +                         unsigned long addr, struct kmem_cache_cpu *c,
22884 +                         struct list_head *to_free)
22885  {
22886 +       struct slub_free_list *f;
22887         void *freelist;
22888         struct page *page;
22889  
22890 @@ -2554,6 +2607,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22891         VM_BUG_ON(!c->page->frozen);
22892         c->freelist = get_freepointer(s, freelist);
22893         c->tid = next_tid(c->tid);
22894 +
22895 +out:
22896 +       f = this_cpu_ptr(&slub_free_list);
22897 +       raw_spin_lock(&f->lock);
22898 +       list_splice_init(&f->list, to_free);
22899 +       raw_spin_unlock(&f->lock);
22900 +
22901         return freelist;
22902  
22903  new_slab:
22904 @@ -2585,7 +2645,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22905         deactivate_slab(s, page, get_freepointer(s, freelist));
22906         c->page = NULL;
22907         c->freelist = NULL;
22908 -       return freelist;
22909 +       goto out;
22910  }
22911  
22912  /*
22913 @@ -2597,6 +2657,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22914  {
22915         void *p;
22916         unsigned long flags;
22917 +       LIST_HEAD(tofree);
22918  
22919         local_irq_save(flags);
22920  #ifdef CONFIG_PREEMPT
22921 @@ -2608,8 +2669,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
22922         c = this_cpu_ptr(s->cpu_slab);
22923  #endif
22924  
22925 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
22926 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
22927         local_irq_restore(flags);
22928 +       free_delayed(&tofree);
22929         return p;
22930  }
22931  
22932 @@ -2795,7 +2857,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22933  
22934         do {
22935                 if (unlikely(n)) {
22936 -                       spin_unlock_irqrestore(&n->list_lock, flags);
22937 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22938                         n = NULL;
22939                 }
22940                 prior = page->freelist;
22941 @@ -2827,7 +2889,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22942                                  * Otherwise the list_lock will synchronize with
22943                                  * other processors updating the list of slabs.
22944                                  */
22945 -                               spin_lock_irqsave(&n->list_lock, flags);
22946 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
22947  
22948                         }
22949                 }
22950 @@ -2869,7 +2931,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22951                 add_partial(n, page, DEACTIVATE_TO_TAIL);
22952                 stat(s, FREE_ADD_PARTIAL);
22953         }
22954 -       spin_unlock_irqrestore(&n->list_lock, flags);
22955 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22956         return;
22957  
22958  slab_empty:
22959 @@ -2884,7 +2946,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
22960                 remove_full(s, n, page);
22961         }
22962  
22963 -       spin_unlock_irqrestore(&n->list_lock, flags);
22964 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
22965         stat(s, FREE_SLAB);
22966         discard_slab(s, page);
22967  }
22968 @@ -3089,6 +3151,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
22969                           void **p)
22970  {
22971         struct kmem_cache_cpu *c;
22972 +       LIST_HEAD(to_free);
22973         int i;
22974  
22975         /* memcg and kmem_cache debug support */
22976 @@ -3112,7 +3175,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
22977                          * of re-populating per CPU c->freelist
22978                          */
22979                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
22980 -                                           _RET_IP_, c);
22981 +                                           _RET_IP_, c, &to_free);
22982                         if (unlikely(!p[i]))
22983                                 goto error;
22984  
22985 @@ -3124,6 +3187,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
22986         }
22987         c->tid = next_tid(c->tid);
22988         local_irq_enable();
22989 +       free_delayed(&to_free);
22990  
22991         /* Clear memory outside IRQ disabled fastpath loop */
22992         if (unlikely(flags & __GFP_ZERO)) {
22993 @@ -3271,7 +3335,7 @@ static void
22994  init_kmem_cache_node(struct kmem_cache_node *n)
22995  {
22996         n->nr_partial = 0;
22997 -       spin_lock_init(&n->list_lock);
22998 +       raw_spin_lock_init(&n->list_lock);
22999         INIT_LIST_HEAD(&n->partial);
23000  #ifdef CONFIG_SLUB_DEBUG
23001         atomic_long_set(&n->nr_slabs, 0);
23002 @@ -3615,6 +3679,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
23003                                                         const char *text)
23004  {
23005  #ifdef CONFIG_SLUB_DEBUG
23006 +#ifdef CONFIG_PREEMPT_RT_BASE
23007 +       /* XXX move out of irq-off section */
23008 +       slab_err(s, page, text, s->name);
23009 +#else
23010         void *addr = page_address(page);
23011         void *p;
23012         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
23013 @@ -3635,6 +3703,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
23014         slab_unlock(page);
23015         kfree(map);
23016  #endif
23017 +#endif
23018  }
23019  
23020  /*
23021 @@ -3648,7 +3717,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
23022         struct page *page, *h;
23023  
23024         BUG_ON(irqs_disabled());
23025 -       spin_lock_irq(&n->list_lock);
23026 +       raw_spin_lock_irq(&n->list_lock);
23027         list_for_each_entry_safe(page, h, &n->partial, lru) {
23028                 if (!page->inuse) {
23029                         remove_partial(n, page);
23030 @@ -3658,7 +3727,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
23031                         "Objects remaining in %s on __kmem_cache_shutdown()");
23032                 }
23033         }
23034 -       spin_unlock_irq(&n->list_lock);
23035 +       raw_spin_unlock_irq(&n->list_lock);
23036  
23037         list_for_each_entry_safe(page, h, &discard, lru)
23038                 discard_slab(s, page);
23039 @@ -3916,7 +3985,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
23040                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
23041                         INIT_LIST_HEAD(promote + i);
23042  
23043 -               spin_lock_irqsave(&n->list_lock, flags);
23044 +               raw_spin_lock_irqsave(&n->list_lock, flags);
23045  
23046                 /*
23047                  * Build lists of slabs to discard or promote.
23048 @@ -3947,7 +4016,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
23049                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
23050                         list_splice(promote + i, &n->partial);
23051  
23052 -               spin_unlock_irqrestore(&n->list_lock, flags);
23053 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
23054  
23055                 /* Release empty slabs */
23056                 list_for_each_entry_safe(page, t, &discard, lru)
23057 @@ -4123,6 +4192,12 @@ void __init kmem_cache_init(void)
23058  {
23059         static __initdata struct kmem_cache boot_kmem_cache,
23060                 boot_kmem_cache_node;
23061 +       int cpu;
23062 +
23063 +       for_each_possible_cpu(cpu) {
23064 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
23065 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
23066 +       }
23067  
23068         if (debug_guardpage_minorder())
23069                 slub_max_order = 0;
23070 @@ -4331,7 +4406,7 @@ static int validate_slab_node(struct kmem_cache *s,
23071         struct page *page;
23072         unsigned long flags;
23073  
23074 -       spin_lock_irqsave(&n->list_lock, flags);
23075 +       raw_spin_lock_irqsave(&n->list_lock, flags);
23076  
23077         list_for_each_entry(page, &n->partial, lru) {
23078                 validate_slab_slab(s, page, map);
23079 @@ -4353,7 +4428,7 @@ static int validate_slab_node(struct kmem_cache *s,
23080                        s->name, count, atomic_long_read(&n->nr_slabs));
23081  
23082  out:
23083 -       spin_unlock_irqrestore(&n->list_lock, flags);
23084 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
23085         return count;
23086  }
23087  
23088 @@ -4541,12 +4616,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
23089                 if (!atomic_long_read(&n->nr_slabs))
23090                         continue;
23091  
23092 -               spin_lock_irqsave(&n->list_lock, flags);
23093 +               raw_spin_lock_irqsave(&n->list_lock, flags);
23094                 list_for_each_entry(page, &n->partial, lru)
23095                         process_slab(&t, s, page, alloc, map);
23096                 list_for_each_entry(page, &n->full, lru)
23097                         process_slab(&t, s, page, alloc, map);
23098 -               spin_unlock_irqrestore(&n->list_lock, flags);
23099 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
23100         }
23101  
23102         for (i = 0; i < t.count; i++) {
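
Most of the remaining slub.c hunks, together with the mm/slab.h hunk earlier,
are a mechanical conversion of kmem_cache_node.list_lock from spinlock_t to
raw_spinlock_t: on PREEMPT_RT a plain spinlock_t becomes a sleeping lock, and
list_lock is taken in paths that must not sleep. As a loose analogy only, the
snippet below shows a config-selected lock type; node_lock_t and the
node_lock*() wrappers are invented, with a pthread spinlock and a pthread mutex
standing in for raw_spinlock_t and spinlock_t. Build with -pthread, optionally
adding -DUSE_SPINLOCK.

/* Loose userspace analogy only, not kernel code.
 * cc -pthread lock_select.c                    (sleeping mutex variant)
 * cc -pthread -DUSE_SPINLOCK lock_select.c     (busy-waiting variant) */
#include <pthread.h>
#include <stdio.h>

#ifdef USE_SPINLOCK
typedef pthread_spinlock_t node_lock_t;
#define node_lock_init(l)       pthread_spin_init((l), PTHREAD_PROCESS_PRIVATE)
#define node_lock(l)            pthread_spin_lock(l)
#define node_unlock(l)          pthread_spin_unlock(l)
#else
typedef pthread_mutex_t node_lock_t;
#define node_lock_init(l)       pthread_mutex_init((l), NULL)
#define node_lock(l)            pthread_mutex_lock(l)
#define node_unlock(l)          pthread_mutex_unlock(l)
#endif

/* The lock type is chosen at build time.  In the patch the callers are
 * switched to the raw_spin_* variants explicitly; here a wrapper hides the
 * choice so the structure of the critical section stays identical. */
struct cache_node {
        node_lock_t list_lock;
        long nr_partial;
};

int main(void)
{
        struct cache_node n;

        n.nr_partial = 0;
        node_lock_init(&n.list_lock);
        node_lock(&n.list_lock);
        n.nr_partial++;
        node_unlock(&n.list_lock);
        printf("nr_partial = %ld\n", n.nr_partial);
        return 0;
}
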
23103 diff --git a/mm/swap.c b/mm/swap.c
23104 index 4dcf852e1e6d..69c3a5b24060 100644
23105 --- a/mm/swap.c
23106 +++ b/mm/swap.c
23107 @@ -32,6 +32,7 @@
23108  #include <linux/memcontrol.h>
23109  #include <linux/gfp.h>
23110  #include <linux/uio.h>
23111 +#include <linux/locallock.h>
23112  #include <linux/hugetlb.h>
23113  #include <linux/page_idle.h>
23114  
23115 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
23116  #ifdef CONFIG_SMP
23117  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
23118  #endif
23119 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
23120 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
23121  
23122  /*
23123   * This path almost never happens for VM activity - pages are normally
23124 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
23125                 unsigned long flags;
23126  
23127                 get_page(page);
23128 -               local_irq_save(flags);
23129 +               local_lock_irqsave(rotate_lock, flags);
23130                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
23131                 if (!pagevec_add(pvec, page) || PageCompound(page))
23132                         pagevec_move_tail(pvec);
23133 -               local_irq_restore(flags);
23134 +               local_unlock_irqrestore(rotate_lock, flags);
23135         }
23136  }
23137  
23138 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
23139  {
23140         page = compound_head(page);
23141         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
23142 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
23143 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
23144 +                                                      activate_page_pvecs);
23145  
23146                 get_page(page);
23147                 if (!pagevec_add(pvec, page) || PageCompound(page))
23148                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
23149 -               put_cpu_var(activate_page_pvecs);
23150 +               put_locked_var(swapvec_lock, activate_page_pvecs);
23151         }
23152  }
23153  
23154 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
23155  
23156  static void __lru_cache_activate_page(struct page *page)
23157  {
23158 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
23159 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
23160         int i;
23161  
23162         /*
23163 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
23164                 }
23165         }
23166  
23167 -       put_cpu_var(lru_add_pvec);
23168 +       put_locked_var(swapvec_lock, lru_add_pvec);
23169  }
23170  
23171  /*
23172 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
23173  
23174  static void __lru_cache_add(struct page *page)
23175  {
23176 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
23177 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
23178  
23179         get_page(page);
23180         if (!pagevec_add(pvec, page) || PageCompound(page))
23181                 __pagevec_lru_add(pvec);
23182 -       put_cpu_var(lru_add_pvec);
23183 +       put_locked_var(swapvec_lock, lru_add_pvec);
23184  }
23185  
23186  /**
23187 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
23188                 unsigned long flags;
23189  
23190                 /* No harm done if a racing interrupt already did this */
23191 -               local_irq_save(flags);
23192 +#ifdef CONFIG_PREEMPT_RT_BASE
23193 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
23194                 pagevec_move_tail(pvec);
23195 -               local_irq_restore(flags);
23196 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
23197 +#else
23198 +               local_lock_irqsave(rotate_lock, flags);
23199 +               pagevec_move_tail(pvec);
23200 +               local_unlock_irqrestore(rotate_lock, flags);
23201 +#endif
23202         }
23203  
23204         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
23205 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
23206                 return;
23207  
23208         if (likely(get_page_unless_zero(page))) {
23209 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
23210 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
23211 +                                                      lru_deactivate_file_pvecs);
23212  
23213                 if (!pagevec_add(pvec, page) || PageCompound(page))
23214                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
23215 -               put_cpu_var(lru_deactivate_file_pvecs);
23216 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
23217         }
23218  }
23219  
23220 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
23221  void deactivate_page(struct page *page)
23222  {
23223         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
23224 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
23225 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
23226 +                                                      lru_deactivate_pvecs);
23227  
23228                 get_page(page);
23229                 if (!pagevec_add(pvec, page) || PageCompound(page))
23230                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
23231 -               put_cpu_var(lru_deactivate_pvecs);
23232 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
23233         }
23234  }
23235  
23236  void lru_add_drain(void)
23237  {
23238 -       lru_add_drain_cpu(get_cpu());
23239 -       put_cpu();
23240 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
23241 +       local_unlock_cpu(swapvec_lock);
23242  }
23243  
23244 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
23245 +#ifdef CONFIG_PREEMPT_RT_BASE
23246 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
23247  {
23248 -       lru_add_drain();
23249 +       local_lock_on(swapvec_lock, cpu);
23250 +       lru_add_drain_cpu(cpu);
23251 +       local_unlock_on(swapvec_lock, cpu);
23252  }
23253  
23254 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
23255 +#else
23256  
23257  /*
23258   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
23259 @@ -686,6 +701,22 @@ static int __init lru_init(void)
23260  }
23261  early_initcall(lru_init);
23262  
23263 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
23264 +{
23265 +       lru_add_drain();
23266 +}
23267 +
23268 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
23269 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
23270 +{
23271 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
23272 +
23273 +       INIT_WORK(work, lru_add_drain_per_cpu);
23274 +       queue_work_on(cpu, lru_add_drain_wq, work);
23275 +       cpumask_set_cpu(cpu, has_work);
23276 +}
23277 +#endif
23278 +
23279  void lru_add_drain_all(void)
23280  {
23281         static DEFINE_MUTEX(lock);
23282 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
23283         cpumask_clear(&has_work);
23284  
23285         for_each_online_cpu(cpu) {
23286 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
23287 -
23288                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
23289                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
23290                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
23291                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
23292 -                   need_activate_page_drain(cpu)) {
23293 -                       INIT_WORK(work, lru_add_drain_per_cpu);
23294 -                       queue_work_on(cpu, lru_add_drain_wq, work);
23295 -                       cpumask_set_cpu(cpu, &has_work);
23296 -               }
23297 +                   need_activate_page_drain(cpu))
23298 +                       remote_lru_add_drain(cpu, &has_work);
23299         }
23300  
23301 +#ifndef CONFIG_PREEMPT_RT_BASE
23302         for_each_cpu(cpu, &has_work)
23303                 flush_work(&per_cpu(lru_add_drain_work, cpu));
23304 +#endif
23305  
23306         put_online_cpus();
23307         mutex_unlock(&lock);
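
In swap.c the per-CPU pagevecs move from get_cpu_var()/put_cpu_var() to
get_locked_var()/put_locked_var() on swapvec_lock, and on
CONFIG_PREEMPT_RT_BASE remote_lru_add_drain() drains another CPU's pagevecs by
taking that CPU's lock rather than queueing work on it. A rough pthread sketch
of that shape follows; struct pvec, cache_add() and drain_worker() are
illustrative names and the mutex stands in for the per-CPU local lock (compile
with -pthread).

/* Illustrative userspace model only, not kernel code.  cc -pthread pvec.c */
#include <pthread.h>
#include <stdio.h>

#define NWORKERS        2
#define PVEC_SIZE       4

/* A tiny "pagevec": a small batch flushed when full.  Each worker owns one,
 * but it is reachable under its lock from any thread, which is what lets a
 * remote drain run without help from the owning worker. */
struct pvec {
        pthread_mutex_t lock;           /* stand-in for swapvec_lock */
        int nr;
        int pages[PVEC_SIZE];
};

static struct pvec pvecs[NWORKERS];

static void pvec_flush(struct pvec *pv)
{
        /* caller holds pv->lock */
        printf("flushing %d pages\n", pv->nr);
        pv->nr = 0;
}

/* Fast path: a worker adds to its own batch under its own lock. */
static void cache_add(int worker, int page)
{
        struct pvec *pv = &pvecs[worker];

        pthread_mutex_lock(&pv->lock);
        pv->pages[pv->nr++] = page;
        if (pv->nr == PVEC_SIZE)
                pvec_flush(pv);
        pthread_mutex_unlock(&pv->lock);
}

/* Remote drain: any thread may flush another worker's batch by taking that
 * worker's lock, no cross-thread signalling or queued work needed. */
static void drain_worker(int worker)
{
        struct pvec *pv = &pvecs[worker];

        pthread_mutex_lock(&pv->lock);
        if (pv->nr)
                pvec_flush(pv);
        pthread_mutex_unlock(&pv->lock);
}

int main(void)
{
        for (int i = 0; i < NWORKERS; i++)
                pthread_mutex_init(&pvecs[i].lock, NULL);
        for (int p = 0; p < 6; p++)
                cache_add(0, p);        /* flushes once at 4, leaves 2 */
        drain_worker(0);                /* flushes the remaining 2 */
        drain_worker(1);                /* nothing to do */
        return 0;
}
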
23308 diff --git a/mm/truncate.c b/mm/truncate.c
23309 index 8d8c62d89e6d..5bf1bd25d077 100644
23310 --- a/mm/truncate.c
23311 +++ b/mm/truncate.c
23312 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
23313          * protected by mapping->tree_lock.
23314          */
23315         if (!workingset_node_shadows(node) &&
23316 -           !list_empty(&node->private_list))
23317 -               list_lru_del(&workingset_shadow_nodes,
23318 +           !list_empty(&node->private_list)) {
23319 +               local_lock(workingset_shadow_lock);
23320 +               list_lru_del(&__workingset_shadow_nodes,
23321                                 &node->private_list);
23322 +               local_unlock(workingset_shadow_lock);
23323 +       }
23324         __radix_tree_delete_node(&mapping->page_tree, node);
23325  unlock:
23326         spin_unlock_irq(&mapping->tree_lock);
23327 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
23328 index f2481cb4e6b2..db4de08fa97c 100644
23329 --- a/mm/vmalloc.c
23330 +++ b/mm/vmalloc.c
23331 @@ -845,7 +845,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
23332         struct vmap_block *vb;
23333         struct vmap_area *va;
23334         unsigned long vb_idx;
23335 -       int node, err;
23336 +       int node, err, cpu;
23337         void *vaddr;
23338  
23339         node = numa_node_id();
23340 @@ -888,11 +888,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
23341         BUG_ON(err);
23342         radix_tree_preload_end();
23343  
23344 -       vbq = &get_cpu_var(vmap_block_queue);
23345 +       cpu = get_cpu_light();
23346 +       vbq = this_cpu_ptr(&vmap_block_queue);
23347         spin_lock(&vbq->lock);
23348         list_add_tail_rcu(&vb->free_list, &vbq->free);
23349         spin_unlock(&vbq->lock);
23350 -       put_cpu_var(vmap_block_queue);
23351 +       put_cpu_light();
23352  
23353         return vaddr;
23354  }
23355 @@ -961,6 +962,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
23356         struct vmap_block *vb;
23357         void *vaddr = NULL;
23358         unsigned int order;
23359 +       int cpu;
23360  
23361         BUG_ON(offset_in_page(size));
23362         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
23363 @@ -975,7 +977,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
23364         order = get_order(size);
23365  
23366         rcu_read_lock();
23367 -       vbq = &get_cpu_var(vmap_block_queue);
23368 +       cpu = get_cpu_light();
23369 +       vbq = this_cpu_ptr(&vmap_block_queue);
23370         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
23371                 unsigned long pages_off;
23372  
23373 @@ -998,7 +1001,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
23374                 break;
23375         }
23376  
23377 -       put_cpu_var(vmap_block_queue);
23378 +       put_cpu_light();
23379         rcu_read_unlock();
23380  
23381         /* Allocate new block if nothing was found */
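
The vmalloc.c change works because vmap_block_queue already carries its own
spinlock: the code only needs a stable pointer to some CPU's queue, which
get_cpu_light() plus this_cpu_ptr() provide, not a preemption-disabled section.
The sketch below makes the same point in userspace: the queue is picked by
sched_getcpu(), but correctness rests entirely on the queue's own mutex, so a
migration right after the lookup is harmless. struct blockq and add_block() are
invented names; build with -pthread (_GNU_SOURCE is needed for sched_getcpu()).

/* Illustrative userspace model only, not kernel code.  cc -pthread vbq.c */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NQUEUES 8

/* Per-CPU-ish free-block queues, each with its own lock, in the spirit of
 * vmap_block_queue.  Correctness comes from q->lock, not from staying
 * pinned to the CPU whose queue was picked. */
struct blockq {
        pthread_mutex_t lock;
        int nblocks;
};

static struct blockq queues[NQUEUES];

static void add_block(void)
{
        /* Pick a queue by current CPU; migrating right after this lookup is
         * harmless because the queue lock serialises the update. */
        int cpu = sched_getcpu();
        struct blockq *q = &queues[(cpu < 0 ? 0 : cpu) % NQUEUES];

        pthread_mutex_lock(&q->lock);
        q->nblocks++;
        pthread_mutex_unlock(&q->lock);
}

int main(void)
{
        int total = 0;

        for (int i = 0; i < NQUEUES; i++)
                pthread_mutex_init(&queues[i].lock, NULL);
        for (int i = 0; i < 100; i++)
                add_block();
        for (int i = 0; i < NQUEUES; i++)
                total += queues[i].nblocks;
        printf("blocks queued: %d\n", total);
        return 0;
}
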
23382 diff --git a/mm/vmstat.c b/mm/vmstat.c
23383 index 604f26a4f696..312006d2db50 100644
23384 --- a/mm/vmstat.c
23385 +++ b/mm/vmstat.c
23386 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
23387         long x;
23388         long t;
23389  
23390 +       preempt_disable_rt();
23391         x = delta + __this_cpu_read(*p);
23392  
23393         t = __this_cpu_read(pcp->stat_threshold);
23394 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
23395                 x = 0;
23396         }
23397         __this_cpu_write(*p, x);
23398 +       preempt_enable_rt();
23399  }
23400  EXPORT_SYMBOL(__mod_zone_page_state);
23401  
23402 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
23403         long x;
23404         long t;
23405  
23406 +       preempt_disable_rt();
23407         x = delta + __this_cpu_read(*p);
23408  
23409         t = __this_cpu_read(pcp->stat_threshold);
23410 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
23411                 x = 0;
23412         }
23413         __this_cpu_write(*p, x);
23414 +       preempt_enable_rt();
23415  }
23416  EXPORT_SYMBOL(__mod_node_page_state);
23417  
23418 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
23419         s8 __percpu *p = pcp->vm_stat_diff + item;
23420         s8 v, t;
23421  
23422 +       preempt_disable_rt();
23423         v = __this_cpu_inc_return(*p);
23424         t = __this_cpu_read(pcp->stat_threshold);
23425         if (unlikely(v > t)) {
23426 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
23427                 zone_page_state_add(v + overstep, zone, item);
23428                 __this_cpu_write(*p, -overstep);
23429         }
23430 +       preempt_enable_rt();
23431  }
23432  
23433  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
23434 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
23435         s8 __percpu *p = pcp->vm_node_stat_diff + item;
23436         s8 v, t;
23437  
23438 +       preempt_disable_rt();
23439         v = __this_cpu_inc_return(*p);
23440         t = __this_cpu_read(pcp->stat_threshold);
23441         if (unlikely(v > t)) {
23442 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
23443                 node_page_state_add(v + overstep, pgdat, item);
23444                 __this_cpu_write(*p, -overstep);
23445         }
23446 +       preempt_enable_rt();
23447  }
23448  
23449  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
23450 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
23451         s8 __percpu *p = pcp->vm_stat_diff + item;
23452         s8 v, t;
23453  
23454 +       preempt_disable_rt();
23455         v = __this_cpu_dec_return(*p);
23456         t = __this_cpu_read(pcp->stat_threshold);
23457         if (unlikely(v < - t)) {
23458 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
23459                 zone_page_state_add(v - overstep, zone, item);
23460                 __this_cpu_write(*p, overstep);
23461         }
23462 +       preempt_enable_rt();
23463  }
23464  
23465  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
23466 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
23467         s8 __percpu *p = pcp->vm_node_stat_diff + item;
23468         s8 v, t;
23469  
23470 +       preempt_disable_rt();
23471         v = __this_cpu_dec_return(*p);
23472         t = __this_cpu_read(pcp->stat_threshold);
23473         if (unlikely(v < - t)) {
23474 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
23475                 node_page_state_add(v - overstep, pgdat, item);
23476                 __this_cpu_write(*p, overstep);
23477         }
23478 +       preempt_enable_rt();
23479  }
23480  
23481  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
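
The vmstat counters keep a small per-CPU diff and fold it into the global
counter only once it crosses stat_threshold; the added preempt_disable_rt() and
preempt_enable_rt() pairs (no-ops without PREEMPT_RT) keep that read-modify-write
on one CPU. The example below reimplements just the threshold-fold scheme in
portable C11 with a per-thread diff and an atomic global counter; THRESHOLD,
mod_counter() and worker() are invented names (compile with -pthread).

/* Illustrative userspace model only, not kernel code.  cc -pthread vmstat_fold.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define THRESHOLD 32

/* Global counter plus a small per-thread diff that is folded in only when it
 * crosses the threshold, like the vm_stat_diff/stat_threshold scheme. */
static atomic_long global_count;
static _Thread_local long local_diff;

static void mod_counter(long delta)
{
        /* In the kernel, this read-modify-write of the per-CPU diff is what
         * preempt_disable_rt()/preempt_enable_rt() bracket; a per-thread
         * variable needs no such protection in userspace. */
        long x = local_diff + delta;

        if (x > THRESHOLD || x < -THRESHOLD) {
                atomic_fetch_add(&global_count, x);
                x = 0;
        }
        local_diff = x;
}

static void *worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 1000; i++)
                mod_counter(1);
        /* Fold whatever is left before exiting. */
        atomic_fetch_add(&global_count, local_diff);
        local_diff = 0;
        return NULL;
}

int main(void)
{
        pthread_t t[2];

        for (int i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, worker, NULL);
        for (int i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        printf("global_count = %ld\n", atomic_load(&global_count));
        return 0;
}
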
23482 diff --git a/mm/workingset.c b/mm/workingset.c
23483 index fb1f9183d89a..7e6ef1a48cd3 100644
23484 --- a/mm/workingset.c
23485 +++ b/mm/workingset.c
23486 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
23487   * point where they would still be useful.
23488   */
23489  
23490 -struct list_lru workingset_shadow_nodes;
23491 +struct list_lru __workingset_shadow_nodes;
23492 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
23493  
23494  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
23495                                         struct shrink_control *sc)
23496 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
23497         unsigned long pages;
23498  
23499         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
23500 -       local_irq_disable();
23501 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
23502 -       local_irq_enable();
23503 +       local_lock_irq(workingset_shadow_lock);
23504 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
23505 +       local_unlock_irq(workingset_shadow_lock);
23506  
23507         if (sc->memcg) {
23508                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
23509 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
23510         spin_unlock(&mapping->tree_lock);
23511         ret = LRU_REMOVED_RETRY;
23512  out:
23513 -       local_irq_enable();
23514 +       local_unlock_irq(workingset_shadow_lock);
23515         cond_resched();
23516 -       local_irq_disable();
23517 +       local_lock_irq(workingset_shadow_lock);
23518         spin_lock(lru_lock);
23519         return ret;
23520  }
23521 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
23522         unsigned long ret;
23523  
23524         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
23525 -       local_irq_disable();
23526 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
23527 +       local_lock_irq(workingset_shadow_lock);
23528 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
23529                                     shadow_lru_isolate, NULL);
23530 -       local_irq_enable();
23531 +       local_unlock_irq(workingset_shadow_lock);
23532         return ret;
23533  }
23534  
23535 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
23536         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
23537                timestamp_bits, max_order, bucket_order);
23538  
23539 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
23540 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
23541         if (ret)
23542                 goto err;
23543         ret = register_shrinker(&workingset_shadow_shrinker);
23544 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
23545                 goto err_list_lru;
23546         return 0;
23547  err_list_lru:
23548 -       list_lru_destroy(&workingset_shadow_nodes);
23549 +       list_lru_destroy(&__workingset_shadow_nodes);
23550  err:
23551         return ret;
23552  }
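
truncate.c and workingset.c stop relying on bare local_irq_disable() around the
shadow-node list_lru and take the new workingset_shadow_lock instead, which is
what lets shadow_lru_isolate() drop and re-take the lock around cond_resched().
The userspace sketch below shows why a named lock makes that pattern possible:
a long scan periodically unlocks, yields and relocks. shadow_lock, shadow_nodes
and scan_shadow_nodes() here are illustrative only (compile with -pthread).

/* Illustrative userspace model only, not kernel code.  cc -pthread shadow_scan.c */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NITEMS 1000

/* Shared state guarded by a named lock, a stand-in for
 * workingset_shadow_lock.  Unlike a bare local_irq_disable() section, a
 * named lock can be dropped and re-taken in the middle of a long walk. */
static pthread_mutex_t shadow_lock = PTHREAD_MUTEX_INITIALIZER;
static int shadow_nodes[NITEMS];

static long scan_shadow_nodes(void)
{
        long reclaimed = 0;

        pthread_mutex_lock(&shadow_lock);
        for (int i = 0; i < NITEMS; i++) {
                reclaimed += shadow_nodes[i];
                shadow_nodes[i] = 0;
                if ((i & 63) == 63) {
                        /* Breathe: unlock, yield, relock, the same shape as
                         * the unlock/cond_resched()/lock sequence in
                         * shadow_lru_isolate(). */
                        pthread_mutex_unlock(&shadow_lock);
                        sched_yield();
                        pthread_mutex_lock(&shadow_lock);
                }
        }
        pthread_mutex_unlock(&shadow_lock);
        return reclaimed;
}

int main(void)
{
        for (int i = 0; i < NITEMS; i++)
                shadow_nodes[i] = 1;
        printf("reclaimed %ld\n", scan_shadow_nodes());
        return 0;
}
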
23553 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
23554 index b0bc023d25c5..5af6426fbcbe 100644
23555 --- a/mm/zsmalloc.c
23556 +++ b/mm/zsmalloc.c
23557 @@ -53,6 +53,7 @@
23558  #include <linux/mount.h>
23559  #include <linux/migrate.h>
23560  #include <linux/pagemap.h>
23561 +#include <linux/locallock.h>
23562  
23563  #define ZSPAGE_MAGIC   0x58
23564  
23565 @@ -70,9 +71,22 @@
23566   */
23567  #define ZS_MAX_ZSPAGE_ORDER 2
23568  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
23569 -
23570  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
23571  
23572 +#ifdef CONFIG_PREEMPT_RT_FULL
23573 +
23574 +struct zsmalloc_handle {
23575 +       unsigned long addr;
23576 +       struct mutex lock;
23577 +};
23578 +
23579 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
23580 +
23581 +#else
23582 +
23583 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
23584 +#endif
23585 +
23586  /*
23587   * Object location (<PFN>, <obj_idx>) is encoded as
23588   * as single (unsigned long) handle value.
23589 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
23590  
23591  static int create_cache(struct zs_pool *pool)
23592  {
23593 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
23594 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
23595                                         0, 0, NULL);
23596         if (!pool->handle_cachep)
23597                 return 1;
23598 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
23599  
23600  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
23601  {
23602 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
23603 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
23604 +       void *p;
23605 +
23606 +       p = kmem_cache_alloc(pool->handle_cachep,
23607 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
23608 +#ifdef CONFIG_PREEMPT_RT_FULL
23609 +       if (p) {
23610 +               struct zsmalloc_handle *zh = p;
23611 +
23612 +               mutex_init(&zh->lock);
23613 +       }
23614 +#endif
23615 +       return (unsigned long)p;
23616  }
23617  
23618 +#ifdef CONFIG_PREEMPT_RT_FULL
23619 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
23620 +{
23621 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
23622 +}
23623 +#endif
23624 +
23625  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
23626  {
23627         kmem_cache_free(pool->handle_cachep, (void *)handle);
23628 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
23629  
23630  static void record_obj(unsigned long handle, unsigned long obj)
23631  {
23632 +#ifdef CONFIG_PREEMPT_RT_FULL
23633 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23634 +
23635 +       WRITE_ONCE(zh->addr, obj);
23636 +#else
23637         /*
23638          * lsb of @obj represents handle lock while other bits
23639          * represent object value the handle is pointing so
23640          * updating shouldn't do store tearing.
23641          */
23642         WRITE_ONCE(*(unsigned long *)handle, obj);
23643 +#endif
23644  }
23645  
23646  /* zpool driver */
23647 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
23648  
23649  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
23650  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
23651 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
23652  
23653  static bool is_zspage_isolated(struct zspage *zspage)
23654  {
23655 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
23656  
23657  static unsigned long handle_to_obj(unsigned long handle)
23658  {
23659 +#ifdef CONFIG_PREEMPT_RT_FULL
23660 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23661 +
23662 +       return zh->addr;
23663 +#else
23664         return *(unsigned long *)handle;
23665 +#endif
23666  }
23667  
23668  static unsigned long obj_to_head(struct page *page, void *obj)
23669 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
23670  
23671  static inline int testpin_tag(unsigned long handle)
23672  {
23673 +#ifdef CONFIG_PREEMPT_RT_FULL
23674 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23675 +
23676 +       return mutex_is_locked(&zh->lock);
23677 +#else
23678         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
23679 +#endif
23680  }
23681  
23682  static inline int trypin_tag(unsigned long handle)
23683  {
23684 +#ifdef CONFIG_PREEMPT_RT_FULL
23685 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23686 +
23687 +       return mutex_trylock(&zh->lock);
23688 +#else
23689         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
23690 +#endif
23691  }
23692  
23693  static void pin_tag(unsigned long handle)
23694  {
23695 +#ifdef CONFIG_PREEMPT_RT_FULL
23696 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23697 +
23698 +       return mutex_lock(&zh->lock);
23699 +#else
23700         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
23701 +#endif
23702  }
23703  
23704  static void unpin_tag(unsigned long handle)
23705  {
23706 +#ifdef CONFIG_PREEMPT_RT_FULL
23707 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
23708 +
23709 +       return mutex_unlock(&zh->lock);
23710 +#else
23711         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
23712 +#endif
23713  }
23714  
23715  static void reset_page(struct page *page)
23716 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
23717         class = pool->size_class[class_idx];
23718         off = (class->size * obj_idx) & ~PAGE_MASK;
23719  
23720 -       area = &get_cpu_var(zs_map_area);
23721 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
23722         area->vm_mm = mm;
23723         if (off + class->size <= PAGE_SIZE) {
23724                 /* this object is contained entirely within a page */
23725 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
23726  
23727                 __zs_unmap_object(area, pages, off, class->size);
23728         }
23729 -       put_cpu_var(zs_map_area);
23730 +       put_locked_var(zs_map_area_lock, zs_map_area);
23731  
23732         migrate_read_unlock(zspage);
23733         unpin_tag(handle);
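
For PREEMPT_RT_FULL, zsmalloc stops keeping a bit spinlock in the low bit of
the handle word and instead allocates a struct zsmalloc_handle carrying the
object address plus a mutex, so pin_tag()/unpin_tag() may sleep. The snippet
below is a userspace rendition of that handle layout; struct handle,
alloc_handle() and the helpers around it are invented stand-ins for the
cache-backed kernel versions (compile with -pthread).

/* Illustrative userspace model only, not kernel code.  cc -pthread handle.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* A handle that carries a sleeping lock next to the object address, the
 * shape used for CONFIG_PREEMPT_RT_FULL instead of a bit spinlock in the
 * low bit of the handle word. */
struct handle {
        unsigned long addr;
        pthread_mutex_t lock;
};

static struct handle *alloc_handle(void)
{
        struct handle *h = malloc(sizeof(*h));

        if (h) {
                h->addr = 0;
                pthread_mutex_init(&h->lock, NULL);
        }
        return h;
}

static void record_obj(struct handle *h, unsigned long obj)  { h->addr = obj; }
static unsigned long handle_to_obj(struct handle *h)         { return h->addr; }
static void pin_tag(struct handle *h)    { pthread_mutex_lock(&h->lock); }
static void unpin_tag(struct handle *h)  { pthread_mutex_unlock(&h->lock); }

int main(void)
{
        struct handle *h = alloc_handle();

        if (!h)
                return 1;
        pin_tag(h);                     /* may sleep; fine on PREEMPT_RT */
        record_obj(h, 0xdeadbeefUL);
        printf("obj = %#lx\n", handle_to_obj(h));
        unpin_tag(h);
        free(h);
        return 0;
}
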
23734 diff --git a/net/core/dev.c b/net/core/dev.c
23735 index e1d731fdc72c..6ab4b7863755 100644
23736 --- a/net/core/dev.c
23737 +++ b/net/core/dev.c
23738 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
23739  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
23740  
23741  static seqcount_t devnet_rename_seq;
23742 +static DEFINE_MUTEX(devnet_rename_mutex);
23743  
23744  static inline void dev_base_seq_inc(struct net *net)
23745  {
23746 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
23747  static inline void rps_lock(struct softnet_data *sd)
23748  {
23749  #ifdef CONFIG_RPS
23750 -       spin_lock(&sd->input_pkt_queue.lock);
23751 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
23752  #endif
23753  }
23754  
23755  static inline void rps_unlock(struct softnet_data *sd)
23756  {
23757  #ifdef CONFIG_RPS
23758 -       spin_unlock(&sd->input_pkt_queue.lock);
23759 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
23760  #endif
23761  }
23762  
23763 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
23764         strcpy(name, dev->name);
23765         rcu_read_unlock();
23766         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
23767 -               cond_resched();
23768 +               mutex_lock(&devnet_rename_mutex);
23769 +               mutex_unlock(&devnet_rename_mutex);
23770                 goto retry;
23771         }
23772  
23773 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
23774         if (dev->flags & IFF_UP)
23775                 return -EBUSY;
23776  
23777 -       write_seqcount_begin(&devnet_rename_seq);
23778 +       mutex_lock(&devnet_rename_mutex);
23779 +       __raw_write_seqcount_begin(&devnet_rename_seq);
23780  
23781 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
23782 -               write_seqcount_end(&devnet_rename_seq);
23783 -               return 0;
23784 -       }
23785 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
23786 +               goto outunlock;
23787  
23788         memcpy(oldname, dev->name, IFNAMSIZ);
23789  
23790         err = dev_get_valid_name(net, dev, newname);
23791 -       if (err < 0) {
23792 -               write_seqcount_end(&devnet_rename_seq);
23793 -               return err;
23794 -       }
23795 +       if (err < 0)
23796 +               goto outunlock;
23797  
23798         if (oldname[0] && !strchr(oldname, '%'))
23799                 netdev_info(dev, "renamed from %s\n", oldname);
23800 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
23801         if (ret) {
23802                 memcpy(dev->name, oldname, IFNAMSIZ);
23803                 dev->name_assign_type = old_assign_type;
23804 -               write_seqcount_end(&devnet_rename_seq);
23805 -               return ret;
23806 +               err = ret;
23807 +               goto outunlock;
23808         }
23809  
23810 -       write_seqcount_end(&devnet_rename_seq);
23811 +       __raw_write_seqcount_end(&devnet_rename_seq);
23812 +       mutex_unlock(&devnet_rename_mutex);
23813  
23814         netdev_adjacent_rename_links(dev, oldname);
23815  
23816 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
23817                 /* err >= 0 after dev_alloc_name() or stores the first errno */
23818                 if (err >= 0) {
23819                         err = ret;
23820 -                       write_seqcount_begin(&devnet_rename_seq);
23821 +                       mutex_lock(&devnet_rename_mutex);
23822 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
23823                         memcpy(dev->name, oldname, IFNAMSIZ);
23824                         memcpy(oldname, newname, IFNAMSIZ);
23825                         dev->name_assign_type = old_assign_type;
23826 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
23827         }
23828  
23829         return err;
23830 +
23831 +outunlock:
23832 +       __raw_write_seqcount_end(&devnet_rename_seq);
23833 +       mutex_unlock(&devnet_rename_mutex);
23834 +       return err;
23835  }
23836  
23837  /**
23838 @@ -2263,6 +2269,7 @@ static void __netif_reschedule(struct Qdisc *q)
23839         sd->output_queue_tailp = &q->next_sched;
23840         raise_softirq_irqoff(NET_TX_SOFTIRQ);
23841         local_irq_restore(flags);
23842 +       preempt_check_resched_rt();
23843  }
23844  
23845  void __netif_schedule(struct Qdisc *q)
23846 @@ -2344,6 +2351,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
23847         __this_cpu_write(softnet_data.completion_queue, skb);
23848         raise_softirq_irqoff(NET_TX_SOFTIRQ);
23849         local_irq_restore(flags);
23850 +       preempt_check_resched_rt();
23851  }
23852  EXPORT_SYMBOL(__dev_kfree_skb_irq);
23853  
23854 @@ -3078,7 +3086,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
23855          * This permits qdisc->running owner to get the lock more
23856          * often and dequeue packets faster.
23857          */
23858 +#ifdef CONFIG_PREEMPT_RT_FULL
23859 +       contended = true;
23860 +#else
23861         contended = qdisc_is_running(q);
23862 +#endif
23863         if (unlikely(contended))
23864                 spin_lock(&q->busylock);
23865  
23866 @@ -3141,8 +3153,10 @@ static void skb_update_prio(struct sk_buff *skb)
23867  #define skb_update_prio(skb)
23868  #endif
23869  
23870 +#ifndef CONFIG_PREEMPT_RT_FULL
23871  DEFINE_PER_CPU(int, xmit_recursion);
23872  EXPORT_SYMBOL(xmit_recursion);
23873 +#endif
23874  
23875  /**
23876   *     dev_loopback_xmit - loop back @skb
23877 @@ -3376,8 +3390,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
23878                 int cpu = smp_processor_id(); /* ok because BHs are off */
23879  
23880                 if (txq->xmit_lock_owner != cpu) {
23881 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
23882 -                                    XMIT_RECURSION_LIMIT))
23883 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
23884                                 goto recursion_alert;
23885  
23886                         skb = validate_xmit_skb(skb, dev);
23887 @@ -3387,9 +3400,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
23888                         HARD_TX_LOCK(dev, txq, cpu);
23889  
23890                         if (!netif_xmit_stopped(txq)) {
23891 -                               __this_cpu_inc(xmit_recursion);
23892 +                               xmit_rec_inc();
23893                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
23894 -                               __this_cpu_dec(xmit_recursion);
23895 +                               xmit_rec_dec();
23896                                 if (dev_xmit_complete(rc)) {
23897                                         HARD_TX_UNLOCK(dev, txq);
23898                                         goto out;
23899 @@ -3763,6 +3776,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
23900         rps_unlock(sd);
23901  
23902         local_irq_restore(flags);
23903 +       preempt_check_resched_rt();
23904  
23905         atomic_long_inc(&skb->dev->rx_dropped);
23906         kfree_skb(skb);
23907 @@ -3781,7 +3795,7 @@ static int netif_rx_internal(struct sk_buff *skb)
23908                 struct rps_dev_flow voidflow, *rflow = &voidflow;
23909                 int cpu;
23910  
23911 -               preempt_disable();
23912 +               migrate_disable();
23913                 rcu_read_lock();
23914  
23915                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
23916 @@ -3791,13 +3805,13 @@ static int netif_rx_internal(struct sk_buff *skb)
23917                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
23918  
23919                 rcu_read_unlock();
23920 -               preempt_enable();
23921 +               migrate_enable();
23922         } else
23923  #endif
23924         {
23925                 unsigned int qtail;
23926 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
23927 -               put_cpu();
23928 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
23929 +               put_cpu_light();
23930         }
23931         return ret;
23932  }
23933 @@ -3831,11 +3845,9 @@ int netif_rx_ni(struct sk_buff *skb)
23934  
23935         trace_netif_rx_ni_entry(skb);
23936  
23937 -       preempt_disable();
23938 +       local_bh_disable();
23939         err = netif_rx_internal(skb);
23940 -       if (local_softirq_pending())
23941 -               do_softirq();
23942 -       preempt_enable();
23943 +       local_bh_enable();
23944  
23945         return err;
23946  }
23947 @@ -4314,7 +4326,7 @@ static void flush_backlog(struct work_struct *work)
23948         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
23949                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23950                         __skb_unlink(skb, &sd->input_pkt_queue);
23951 -                       kfree_skb(skb);
23952 +                       __skb_queue_tail(&sd->tofree_queue, skb);
23953                         input_queue_head_incr(sd);
23954                 }
23955         }
23956 @@ -4324,11 +4336,14 @@ static void flush_backlog(struct work_struct *work)
23957         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
23958                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
23959                         __skb_unlink(skb, &sd->process_queue);
23960 -                       kfree_skb(skb);
23961 +                       __skb_queue_tail(&sd->tofree_queue, skb);
23962                         input_queue_head_incr(sd);
23963                 }
23964         }
23965 +       if (!skb_queue_empty(&sd->tofree_queue))
23966 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
23967         local_bh_enable();
23968 +
23969  }
23970  
23971  static void flush_all_backlogs(void)
23972 @@ -4809,6 +4824,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
23973                 sd->rps_ipi_list = NULL;
23974  
23975                 local_irq_enable();
23976 +               preempt_check_resched_rt();
23977  
23978                 /* Send pending IPI's to kick RPS processing on remote cpus. */
23979                 while (remsd) {
23980 @@ -4822,6 +4838,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
23981         } else
23982  #endif
23983                 local_irq_enable();
23984 +       preempt_check_resched_rt();
23985  }
23986  
23987  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
23988 @@ -4851,7 +4868,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
23989         while (again) {
23990                 struct sk_buff *skb;
23991  
23992 +               local_irq_disable();
23993                 while ((skb = __skb_dequeue(&sd->process_queue))) {
23994 +                       local_irq_enable();
23995                         rcu_read_lock();
23996                         __netif_receive_skb(skb);
23997                         rcu_read_unlock();
23998 @@ -4859,9 +4878,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
23999                         if (++work >= quota)
24000                                 return work;
24001  
24002 +                       local_irq_disable();
24003                 }
24004  
24005 -               local_irq_disable();
24006                 rps_lock(sd);
24007                 if (skb_queue_empty(&sd->input_pkt_queue)) {
24008                         /*
24009 @@ -4899,9 +4918,11 @@ void __napi_schedule(struct napi_struct *n)
24010         local_irq_save(flags);
24011         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
24012         local_irq_restore(flags);
24013 +       preempt_check_resched_rt();
24014  }
24015  EXPORT_SYMBOL(__napi_schedule);
24016  
24017 +#ifndef CONFIG_PREEMPT_RT_FULL
24018  /**
24019   * __napi_schedule_irqoff - schedule for receive
24020   * @n: entry to schedule
24021 @@ -4913,6 +4934,7 @@ void __napi_schedule_irqoff(struct napi_struct *n)
24022         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
24023  }
24024  EXPORT_SYMBOL(__napi_schedule_irqoff);
24025 +#endif
24026  
24027  void __napi_complete(struct napi_struct *n)
24028  {
24029 @@ -5202,13 +5224,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
24030         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
24031         unsigned long time_limit = jiffies + 2;
24032         int budget = netdev_budget;
24033 +       struct sk_buff_head tofree_q;
24034 +       struct sk_buff *skb;
24035         LIST_HEAD(list);
24036         LIST_HEAD(repoll);
24037  
24038 +       __skb_queue_head_init(&tofree_q);
24039 +
24040         local_irq_disable();
24041 +       skb_queue_splice_init(&sd->tofree_queue, &tofree_q);
24042         list_splice_init(&sd->poll_list, &list);
24043         local_irq_enable();
24044  
24045 +       while ((skb = __skb_dequeue(&tofree_q)))
24046 +               kfree_skb(skb);
24047 +
24048         for (;;) {
24049                 struct napi_struct *n;
24050  
24051 @@ -5239,7 +5269,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
24052         list_splice_tail(&repoll, &list);
24053         list_splice(&list, &sd->poll_list);
24054         if (!list_empty(&sd->poll_list))
24055 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
24056 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
24057  
24058         net_rps_action_and_irq_enable(sd);
24059  }
24060 @@ -8000,16 +8030,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
24061  
24062         raise_softirq_irqoff(NET_TX_SOFTIRQ);
24063         local_irq_enable();
24064 +       preempt_check_resched_rt();
24065  
24066         /* Process offline CPU's input_pkt_queue */
24067         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
24068                 netif_rx_ni(skb);
24069                 input_queue_head_incr(oldsd);
24070         }
24071 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
24072 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
24073                 netif_rx_ni(skb);
24074                 input_queue_head_incr(oldsd);
24075         }
24076 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
24077 +               kfree_skb(skb);
24078 +       }
24079  
24080         return NOTIFY_OK;
24081  }
24082 @@ -8314,8 +8348,9 @@ static int __init net_dev_init(void)
24083  
24084                 INIT_WORK(flush, flush_backlog);
24085  
24086 -               skb_queue_head_init(&sd->input_pkt_queue);
24087 -               skb_queue_head_init(&sd->process_queue);
24088 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
24089 +               skb_queue_head_init_raw(&sd->process_queue);
24090 +               skb_queue_head_init_raw(&sd->tofree_queue);
24091                 INIT_LIST_HEAD(&sd->poll_list);
24092                 sd->output_queue_tailp = &sd->output_queue;
24093  #ifdef CONFIG_RPS
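
flush_backlog() above stops freeing skbs directly while the raw backlog lock is held; they are parked on a per-CPU tofree_queue and drained later from net_rx_action() with interrupts enabled. A small sketch of that defer-then-drain pattern, assuming an illustrative singly linked list in place of sk_buff_head:

#include <stdio.h>
#include <stdlib.h>

struct buf {
	struct buf *next;
	int id;
};

static struct buf *tofree_queue;

/* Called from the "hot" path: just park the buffer, do not free it here. */
static void defer_free(struct buf *b)
{
	b->next = tofree_queue;
	tofree_queue = b;
}

/* Called later, from a context where freeing is cheap and preemptible. */
static void drain_tofree_queue(void)
{
	struct buf *b;

	while ((b = tofree_queue)) {
		tofree_queue = b->next;
		printf("freeing buf %d\n", b->id);
		free(b);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct buf *b = malloc(sizeof(*b));

		b->id = i;
		defer_free(b);
	}
	drain_tofree_queue();
	return 0;
}
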
24094 diff --git a/net/core/filter.c b/net/core/filter.c
24095 index b391209838ef..b86e9681a88e 100644
24096 --- a/net/core/filter.c
24097 +++ b/net/core/filter.c
24098 @@ -1645,7 +1645,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
24099  {
24100         int ret;
24101  
24102 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
24103 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
24104                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
24105                 kfree_skb(skb);
24106                 return -ENETDOWN;
24107 @@ -1653,9 +1653,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
24108  
24109         skb->dev = dev;
24110  
24111 -       __this_cpu_inc(xmit_recursion);
24112 +       xmit_rec_inc();
24113         ret = dev_queue_xmit(skb);
24114 -       __this_cpu_dec(xmit_recursion);
24115 +       xmit_rec_dec();
24116  
24117         return ret;
24118  }
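
Both __dev_queue_xmit() and __bpf_tx_skb() above switch from the per-CPU xmit_recursion counter to xmit_rec_read()/xmit_rec_inc()/xmit_rec_dec(); with a preemptible, migratable transmit path the counter presumably has to follow the task rather than the CPU (that mapping is an assumption here; the helpers are introduced elsewhere in this patch). A minimal userspace sketch of the recursion guard itself, with a thread-local counter standing in for per-task state:

#include <stdio.h>

#define XMIT_RECURSION_LIMIT 10

static _Thread_local int xmit_recursion;

static int xmit_rec_read(void) { return xmit_recursion; }
static void xmit_rec_inc(void) { xmit_recursion++; }
static void xmit_rec_dec(void) { xmit_recursion--; }

/* A transmit path that may loop back into itself (e.g. via a virtual device). */
static int queue_xmit(int depth_left)
{
	if (xmit_rec_read() > XMIT_RECURSION_LIMIT) {
		fprintf(stderr, "recursion limit reached, dropping\n");
		return -1;
	}
	if (depth_left == 0)
		return 0;

	xmit_rec_inc();
	int ret = queue_xmit(depth_left - 1);   /* re-entry */
	xmit_rec_dec();
	return ret;
}

int main(void)
{
	printf("shallow: %d\n", queue_xmit(3));    /* succeeds */
	printf("deep:    %d\n", queue_xmit(50));   /* trips the limit */
	return 0;
}
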
24119 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
24120 index cad8e791f28e..2a9364fe62a5 100644
24121 --- a/net/core/gen_estimator.c
24122 +++ b/net/core/gen_estimator.c
24123 @@ -84,7 +84,7 @@ struct gen_estimator
24124         struct gnet_stats_basic_packed  *bstats;
24125         struct gnet_stats_rate_est64    *rate_est;
24126         spinlock_t              *stats_lock;
24127 -       seqcount_t              *running;
24128 +       net_seqlock_t           *running;
24129         int                     ewma_log;
24130         u32                     last_packets;
24131         unsigned long           avpps;
24132 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
24133                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
24134                       struct gnet_stats_rate_est64 *rate_est,
24135                       spinlock_t *stats_lock,
24136 -                     seqcount_t *running,
24137 +                     net_seqlock_t *running,
24138                       struct nlattr *opt)
24139  {
24140         struct gen_estimator *est;
24141 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
24142                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
24143                           struct gnet_stats_rate_est64 *rate_est,
24144                           spinlock_t *stats_lock,
24145 -                         seqcount_t *running, struct nlattr *opt)
24146 +                         net_seqlock_t *running, struct nlattr *opt)
24147  {
24148         gen_kill_estimator(bstats, rate_est);
24149         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
24150 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
24151 index 508e051304fb..bc3b17b78c94 100644
24152 --- a/net/core/gen_stats.c
24153 +++ b/net/core/gen_stats.c
24154 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
24155  }
24156  
24157  void
24158 -__gnet_stats_copy_basic(const seqcount_t *running,
24159 +__gnet_stats_copy_basic(net_seqlock_t *running,
24160                         struct gnet_stats_basic_packed *bstats,
24161                         struct gnet_stats_basic_cpu __percpu *cpu,
24162                         struct gnet_stats_basic_packed *b)
24163 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
24164         }
24165         do {
24166                 if (running)
24167 -                       seq = read_seqcount_begin(running);
24168 +                       seq = net_seq_begin(running);
24169                 bstats->bytes = b->bytes;
24170                 bstats->packets = b->packets;
24171 -       } while (running && read_seqcount_retry(running, seq));
24172 +       } while (running && net_seq_retry(running, seq));
24173  }
24174  EXPORT_SYMBOL(__gnet_stats_copy_basic);
24175  
24176 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
24177   * if the room in the socket buffer was not sufficient.
24178   */
24179  int
24180 -gnet_stats_copy_basic(const seqcount_t *running,
24181 +gnet_stats_copy_basic(net_seqlock_t *running,
24182                       struct gnet_dump *d,
24183                       struct gnet_stats_basic_cpu __percpu *cpu,
24184                       struct gnet_stats_basic_packed *b)
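
The gen_estimator/gen_stats hunks change the type of the 'running' counter from seqcount_t to net_seqlock_t and move the readers to net_seq_begin()/net_seq_retry(); the reader-side retry loop keeps its usual shape, while on RT the writer side is presumably backed by a sleeping lock. A small userspace sketch of that reader pattern, assuming illustrative names and C11 atomics:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct stats {
	atomic_uint seq;       /* even: stable, odd: write in progress */
	uint64_t bytes;
	uint64_t packets;
};

static unsigned read_begin(const struct stats *s)
{
	unsigned seq;

	do {
		seq = atomic_load_explicit(&s->seq, memory_order_acquire);
	} while (seq & 1);     /* writer active, retry */
	return seq;
}

static int read_retry(const struct stats *s, unsigned seq)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&s->seq, memory_order_relaxed) != seq;
}

static void copy_basic(const struct stats *s, uint64_t *bytes, uint64_t *packets)
{
	unsigned seq;

	do {
		seq = read_begin(s);
		*bytes = s->bytes;
		*packets = s->packets;
	} while (read_retry(s, seq));
}

int main(void)
{
	struct stats s = { .bytes = 1500, .packets = 1 };
	uint64_t b, p;

	atomic_init(&s.seq, 0);
	copy_basic(&s, &b, &p);
	printf("%llu bytes / %llu packets\n",
	       (unsigned long long)b, (unsigned long long)p);
	return 0;
}
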
24185 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
24186 index 1e3e0087245b..1077b39db717 100644
24187 --- a/net/core/skbuff.c
24188 +++ b/net/core/skbuff.c
24189 @@ -64,6 +64,7 @@
24190  #include <linux/errqueue.h>
24191  #include <linux/prefetch.h>
24192  #include <linux/if_vlan.h>
24193 +#include <linux/locallock.h>
24194  
24195  #include <net/protocol.h>
24196  #include <net/dst.h>
24197 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
24198  
24199  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
24200  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
24201 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
24202 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
24203  
24204  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
24205  {
24206 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
24207         unsigned long flags;
24208         void *data;
24209  
24210 -       local_irq_save(flags);
24211 +       local_lock_irqsave(netdev_alloc_lock, flags);
24212         nc = this_cpu_ptr(&netdev_alloc_cache);
24213         data = __alloc_page_frag(nc, fragsz, gfp_mask);
24214 -       local_irq_restore(flags);
24215 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
24216         return data;
24217  }
24218  
24219 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
24220  
24221  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
24222  {
24223 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
24224 +       struct napi_alloc_cache *nc;
24225 +       void *data;
24226  
24227 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
24228 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24229 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
24230 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24231 +       return data;
24232  }
24233  
24234  void *napi_alloc_frag(unsigned int fragsz)
24235 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
24236         if (sk_memalloc_socks())
24237                 gfp_mask |= __GFP_MEMALLOC;
24238  
24239 -       local_irq_save(flags);
24240 +       local_lock_irqsave(netdev_alloc_lock, flags);
24241  
24242         nc = this_cpu_ptr(&netdev_alloc_cache);
24243         data = __alloc_page_frag(nc, len, gfp_mask);
24244         pfmemalloc = nc->pfmemalloc;
24245  
24246 -       local_irq_restore(flags);
24247 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
24248  
24249         if (unlikely(!data))
24250                 return NULL;
24251 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
24252  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
24253                                  gfp_t gfp_mask)
24254  {
24255 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
24256 +       struct napi_alloc_cache *nc;
24257         struct sk_buff *skb;
24258         void *data;
24259 +       bool pfmemalloc;
24260  
24261         len += NET_SKB_PAD + NET_IP_ALIGN;
24262  
24263 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
24264         if (sk_memalloc_socks())
24265                 gfp_mask |= __GFP_MEMALLOC;
24266  
24267 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24268         data = __alloc_page_frag(&nc->page, len, gfp_mask);
24269 +       pfmemalloc = nc->page.pfmemalloc;
24270 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24271         if (unlikely(!data))
24272                 return NULL;
24273  
24274 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
24275         }
24276  
24277         /* use OR instead of assignment to avoid clearing of bits in mask */
24278 -       if (nc->page.pfmemalloc)
24279 +       if (pfmemalloc)
24280                 skb->pfmemalloc = 1;
24281         skb->head_frag = 1;
24282  
24283 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
24284  
24285  void __kfree_skb_flush(void)
24286  {
24287 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
24288 +       struct napi_alloc_cache *nc;
24289  
24290 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24291         /* flush skb_cache if containing objects */
24292         if (nc->skb_count) {
24293                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
24294                                      nc->skb_cache);
24295                 nc->skb_count = 0;
24296         }
24297 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24298  }
24299  
24300  static inline void _kfree_skb_defer(struct sk_buff *skb)
24301  {
24302 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
24303 +       struct napi_alloc_cache *nc;
24304  
24305         /* drop skb->head and call any destructors for packet */
24306         skb_release_all(skb);
24307  
24308 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24309         /* record skb to CPU local list */
24310         nc->skb_cache[nc->skb_count++] = skb;
24311  
24312 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
24313                                      nc->skb_cache);
24314                 nc->skb_count = 0;
24315         }
24316 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
24317  }
24318  void __kfree_skb_defer(struct sk_buff *skb)
24319  {
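
The skbuff hunks above wrap the per-CPU netdev/napi allocation caches in local locks (local_lock_irqsave(), get_locked_var()) instead of relying on local_irq_save() or this_cpu_ptr() alone, so the critical sections stay preemptible on RT. A minimal userspace sketch of the get-locked/put-locked pairing, assuming pthreads and a single global cache standing in for one CPU's cache; build with -pthread:

#include <pthread.h>
#include <stdio.h>

struct alloc_cache {
	pthread_mutex_t lock;
	int objects_cached;
};

static struct alloc_cache napi_cache = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
};

/* get_locked_var() analogue: lock the cache, hand back a pointer to it. */
static struct alloc_cache *get_locked_cache(void)
{
	pthread_mutex_lock(&napi_cache.lock);
	return &napi_cache;
}

/* put_locked_var() analogue. */
static void put_locked_cache(struct alloc_cache *c)
{
	pthread_mutex_unlock(&c->lock);
}

static void *alloc_frag(void)
{
	struct alloc_cache *c = get_locked_cache();
	void *data;

	c->objects_cached++;                 /* touch the cache under the lock */
	data = (void *)&c->objects_cached;   /* stand-in for a real allocation */
	put_locked_cache(c);
	return data;
}

int main(void)
{
	alloc_frag();
	alloc_frag();
	printf("objects cached: %d\n", napi_cache.objects_cached);
	return 0;
}
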
24320 diff --git a/net/core/sock.c b/net/core/sock.c
24321 index bc6543f7de36..2c32ee79620f 100644
24322 --- a/net/core/sock.c
24323 +++ b/net/core/sock.c
24324 @@ -2488,12 +2488,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
24325         if (sk->sk_lock.owned)
24326                 __lock_sock(sk);
24327         sk->sk_lock.owned = 1;
24328 -       spin_unlock(&sk->sk_lock.slock);
24329 +       spin_unlock_bh(&sk->sk_lock.slock);
24330         /*
24331          * The sk_lock has mutex_lock() semantics here:
24332          */
24333         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
24334 -       local_bh_enable();
24335  }
24336  EXPORT_SYMBOL(lock_sock_nested);
24337  
24338 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
24339 index 48734ee6293f..e6864ff11352 100644
24340 --- a/net/ipv4/icmp.c
24341 +++ b/net/ipv4/icmp.c
24342 @@ -69,6 +69,7 @@
24343  #include <linux/jiffies.h>
24344  #include <linux/kernel.h>
24345  #include <linux/fcntl.h>
24346 +#include <linux/sysrq.h>
24347  #include <linux/socket.h>
24348  #include <linux/in.h>
24349  #include <linux/inet.h>
24350 @@ -77,6 +78,7 @@
24351  #include <linux/string.h>
24352  #include <linux/netfilter_ipv4.h>
24353  #include <linux/slab.h>
24354 +#include <linux/locallock.h>
24355  #include <net/snmp.h>
24356  #include <net/ip.h>
24357  #include <net/route.h>
24358 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
24359   *
24360   *     On SMP we have one ICMP socket per-cpu.
24361   */
24362 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
24363 +
24364  static struct sock *icmp_sk(struct net *net)
24365  {
24366         return *this_cpu_ptr(net->ipv4.icmp_sk);
24367 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
24368  
24369         local_bh_disable();
24370  
24371 +       local_lock(icmp_sk_lock);
24372         sk = icmp_sk(net);
24373  
24374         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
24375                 /* This can happen if the output path signals a
24376                  * dst_link_failure() for an outgoing ICMP packet.
24377                  */
24378 +               local_unlock(icmp_sk_lock);
24379                 local_bh_enable();
24380                 return NULL;
24381         }
24382 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
24383  static inline void icmp_xmit_unlock(struct sock *sk)
24384  {
24385         spin_unlock_bh(&sk->sk_lock.slock);
24386 +       local_unlock(icmp_sk_lock);
24387  }
24388  
24389  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
24390 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
24391         struct sock *sk;
24392         struct sk_buff *skb;
24393  
24394 +       local_lock(icmp_sk_lock);
24395         sk = icmp_sk(dev_net((*rt)->dst.dev));
24396         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
24397                            icmp_param->data_len+icmp_param->head_len,
24398 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
24399                 skb->ip_summed = CHECKSUM_NONE;
24400                 ip_push_pending_frames(sk, fl4);
24401         }
24402 +       local_unlock(icmp_sk_lock);
24403  }
24404  
24405  /*
24406 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
24407  }
24408  
24409  /*
24410 + * 32bit and 64bit have different timestamp length, so we check for
24411 + * the cookie at offset 20 and verify it is repeated at offset 50
24412 + */
24413 +#define CO_POS0                20
24414 +#define CO_POS1                50
24415 +#define CO_SIZE                sizeof(int)
24416 +#define ICMP_SYSRQ_SIZE        57
24417 +
24418 +/*
24419 + * We got an ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
24420 + * pattern and, if it matches, send the next byte as a trigger to sysrq.
24421 + */
24422 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
24423 +{
24424 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
24425 +       char *p = skb->data;
24426 +
24427 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
24428 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
24429 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
24430 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
24431 +}
24432 +
24433 +/*
24434   *     Handle ICMP_ECHO ("ping") requests.
24435   *
24436   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
24437 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
24438                 icmp_param.data_len        = skb->len;
24439                 icmp_param.head_len        = sizeof(struct icmphdr);
24440                 icmp_reply(&icmp_param, skb);
24441 +
24442 +               if (skb->len == ICMP_SYSRQ_SIZE &&
24443 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
24444 +                       icmp_check_sysrq(net, skb);
24445 +               }
24446         }
24447         /* should there be an ICMP stat for ignored echos? */
24448         return true;
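
icmp_check_sysrq() above accepts a ping of exactly ICMP_SYSRQ_SIZE bytes whose payload carries the configured cookie at offsets 20 and 50, each followed by the same byte, and hands that byte to handle_sysrq(). A userspace re-statement of the same check, assuming an illustrative parse_sysrq_key() helper and sample buffer:

#include <arpa/inet.h>   /* htonl() */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CO_POS0         20
#define CO_POS1         50
#define CO_SIZE         sizeof(uint32_t)
#define ICMP_SYSRQ_SIZE 57

/* Returns the SysRq key, or -1 if the payload does not match the cookie. */
static int parse_sysrq_key(uint32_t cookie_host, const unsigned char *p, size_t len)
{
	uint32_t cookie = htonl(cookie_host);

	if (len != ICMP_SYSRQ_SIZE)
		return -1;
	if (memcmp(&cookie, p + CO_POS0, CO_SIZE) ||
	    memcmp(&cookie, p + CO_POS1, CO_SIZE) ||
	    p[CO_POS0 + CO_SIZE] != p[CO_POS1 + CO_SIZE])
		return -1;
	return p[CO_POS0 + CO_SIZE];
}

int main(void)
{
	unsigned char payload[ICMP_SYSRQ_SIZE] = { 0 };
	uint32_t cookie = 0x01020304;
	uint32_t be = htonl(cookie);

	/* Lay out cookie + key at both positions, as the sender would. */
	memcpy(payload + CO_POS0, &be, CO_SIZE);
	memcpy(payload + CO_POS1, &be, CO_SIZE);
	payload[CO_POS0 + CO_SIZE] = 'h';
	payload[CO_POS1 + CO_SIZE] = 'h';

	printf("sysrq key: %c\n", parse_sysrq_key(cookie, payload, sizeof(payload)));
	return 0;
}
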
24449 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
24450 index 80bc36b25de2..215b90adfb05 100644
24451 --- a/net/ipv4/sysctl_net_ipv4.c
24452 +++ b/net/ipv4/sysctl_net_ipv4.c
24453 @@ -681,6 +681,13 @@ static struct ctl_table ipv4_net_table[] = {
24454                 .proc_handler   = proc_dointvec
24455         },
24456         {
24457 +               .procname       = "icmp_echo_sysrq",
24458 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
24459 +               .maxlen         = sizeof(int),
24460 +               .mode           = 0644,
24461 +               .proc_handler   = proc_dointvec
24462 +       },
24463 +       {
24464                 .procname       = "icmp_ignore_bogus_error_responses",
24465                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
24466                 .maxlen         = sizeof(int),
24467 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
24468 index 2259114c7242..829e60985a81 100644
24469 --- a/net/ipv4/tcp_ipv4.c
24470 +++ b/net/ipv4/tcp_ipv4.c
24471 @@ -62,6 +62,7 @@
24472  #include <linux/init.h>
24473  #include <linux/times.h>
24474  #include <linux/slab.h>
24475 +#include <linux/locallock.h>
24476  
24477  #include <net/net_namespace.h>
24478  #include <net/icmp.h>
24479 @@ -564,6 +565,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
24480  }
24481  EXPORT_SYMBOL(tcp_v4_send_check);
24482  
24483 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
24484  /*
24485   *     This routine will send an RST to the other tcp.
24486   *
24487 @@ -691,6 +693,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
24488                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
24489  
24490         arg.tos = ip_hdr(skb)->tos;
24491 +
24492 +       local_lock(tcp_sk_lock);
24493         local_bh_disable();
24494         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
24495                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
24496 @@ -700,6 +704,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
24497         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
24498         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
24499         local_bh_enable();
24500 +       local_unlock(tcp_sk_lock);
24501  
24502  #ifdef CONFIG_TCP_MD5SIG
24503  out:
24504 @@ -775,6 +780,7 @@ static void tcp_v4_send_ack(struct net *net,
24505         if (oif)
24506                 arg.bound_dev_if = oif;
24507         arg.tos = tos;
24508 +       local_lock(tcp_sk_lock);
24509         local_bh_disable();
24510         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
24511                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
24512 @@ -783,6 +789,7 @@ static void tcp_v4_send_ack(struct net *net,
24513  
24514         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
24515         local_bh_enable();
24516 +       local_unlock(tcp_sk_lock);
24517  }
24518  
24519  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
24520 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
24521 index 2384b4aae064..bf7ab51d7035 100644
24522 --- a/net/mac80211/rx.c
24523 +++ b/net/mac80211/rx.c
24524 @@ -4166,7 +4166,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
24525         struct ieee80211_supported_band *sband;
24526         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
24527  
24528 -       WARN_ON_ONCE(softirq_count() == 0);
24529 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
24530  
24531         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
24532                 goto drop;
24533 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
24534 index 004af030ef1a..b64f751bda45 100644
24535 --- a/net/netfilter/core.c
24536 +++ b/net/netfilter/core.c
24537 @@ -22,12 +22,18 @@
24538  #include <linux/proc_fs.h>
24539  #include <linux/mutex.h>
24540  #include <linux/slab.h>
24541 +#include <linux/locallock.h>
24542  #include <linux/rcupdate.h>
24543  #include <net/net_namespace.h>
24544  #include <net/sock.h>
24545  
24546  #include "nf_internals.h"
24547  
24548 +#ifdef CONFIG_PREEMPT_RT_BASE
24549 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
24550 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
24551 +#endif
24552 +
24553  static DEFINE_MUTEX(afinfo_mutex);
24554  
24555  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
24556 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
24557 index dd2332390c45..f6a703b25b6c 100644
24558 --- a/net/packet/af_packet.c
24559 +++ b/net/packet/af_packet.c
24560 @@ -63,6 +63,7 @@
24561  #include <linux/if_packet.h>
24562  #include <linux/wireless.h>
24563  #include <linux/kernel.h>
24564 +#include <linux/delay.h>
24565  #include <linux/kmod.h>
24566  #include <linux/slab.h>
24567  #include <linux/vmalloc.h>
24568 @@ -694,7 +695,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
24569         if (BLOCK_NUM_PKTS(pbd)) {
24570                 while (atomic_read(&pkc->blk_fill_in_prog)) {
24571                         /* Waiting for skb_copy_bits to finish... */
24572 -                       cpu_relax();
24573 +                       cpu_chill();
24574                 }
24575         }
24576  
24577 @@ -956,7 +957,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
24578                 if (!(status & TP_STATUS_BLK_TMO)) {
24579                         while (atomic_read(&pkc->blk_fill_in_prog)) {
24580                                 /* Waiting for skb_copy_bits to finish... */
24581 -                               cpu_relax();
24582 +                               cpu_chill();
24583                         }
24584                 }
24585                 prb_close_block(pkc, pbd, po, status);
24586 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
24587 index 977f69886c00..f3e7a36b0396 100644
24588 --- a/net/rds/ib_rdma.c
24589 +++ b/net/rds/ib_rdma.c
24590 @@ -34,6 +34,7 @@
24591  #include <linux/slab.h>
24592  #include <linux/rculist.h>
24593  #include <linux/llist.h>
24594 +#include <linux/delay.h>
24595  
24596  #include "rds_single_path.h"
24597  #include "ib_mr.h"
24598 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
24599         for_each_online_cpu(cpu) {
24600                 flag = &per_cpu(clean_list_grace, cpu);
24601                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
24602 -                       cpu_relax();
24603 +                       cpu_chill();
24604         }
24605  }
24606  
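
The af_packet and RDS hunks above replace cpu_relax() busy-wait loops with cpu_chill(): when the task being waited on can itself be preempted, spinning may never let it run, so the waiter sleeps briefly instead. A small userspace sketch of the same idea, assuming pthreads, a flag to wait on, and ~1 ms naps (all illustrative); build with -pthread:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int fill_in_progress = 1;

static void cpu_chill_analogue(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 * 1000 }; /* ~1 ms */

	nanosleep(&ts, NULL);
}

static void *producer(void *arg)
{
	(void)arg;
	cpu_chill_analogue();                    /* pretend to copy data */
	atomic_store(&fill_in_progress, 0);      /* done */
	return NULL;
}

int main(void)
{
	pthread_t t;
	int retries = 0;

	pthread_create(&t, NULL, producer, NULL);

	/* The loop the hunks patch: yield the CPU instead of spinning on it. */
	while (atomic_load(&fill_in_progress)) {
		cpu_chill_analogue();
		retries++;
	}

	pthread_join(t, NULL);
	printf("waited %d retries\n", retries);
	return 0;
}
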
24607 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
24608 index 7d921e56e715..13df56a738e5 100644
24609 --- a/net/rxrpc/security.c
24610 +++ b/net/rxrpc/security.c
24611 @@ -19,9 +19,6 @@
24612  #include <keys/rxrpc-type.h>
24613  #include "ar-internal.h"
24614  
24615 -static LIST_HEAD(rxrpc_security_methods);
24616 -static DECLARE_RWSEM(rxrpc_security_sem);
24617 -
24618  static const struct rxrpc_security *rxrpc_security_types[] = {
24619         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
24620  #ifdef CONFIG_RXKAD
24621 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
24622 index 206dc24add3a..00ea9bde5bb3 100644
24623 --- a/net/sched/sch_api.c
24624 +++ b/net/sched/sch_api.c
24625 @@ -981,7 +981,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
24626                         rcu_assign_pointer(sch->stab, stab);
24627                 }
24628                 if (tca[TCA_RATE]) {
24629 -                       seqcount_t *running;
24630 +                       net_seqlock_t *running;
24631  
24632                         err = -EOPNOTSUPP;
24633                         if (sch->flags & TCQ_F_MQROOT)
24634 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
24635 index 6cfb6e9038c2..20727e1347de 100644
24636 --- a/net/sched/sch_generic.c
24637 +++ b/net/sched/sch_generic.c
24638 @@ -425,7 +425,11 @@ struct Qdisc noop_qdisc = {
24639         .ops            =       &noop_qdisc_ops,
24640         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
24641         .dev_queue      =       &noop_netdev_queue,
24642 +#ifdef CONFIG_PREEMPT_RT_BASE
24643 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
24644 +#else
24645         .running        =       SEQCNT_ZERO(noop_qdisc.running),
24646 +#endif
24647         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
24648  };
24649  EXPORT_SYMBOL(noop_qdisc);
24650 @@ -624,9 +628,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
24651         lockdep_set_class(&sch->busylock,
24652                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
24653  
24654 +#ifdef CONFIG_PREEMPT_RT_BASE
24655 +       seqlock_init(&sch->running);
24656 +       lockdep_set_class(&sch->running.seqcount,
24657 +                         dev->qdisc_running_key ?: &qdisc_running_key);
24658 +       lockdep_set_class(&sch->running.lock,
24659 +                         dev->qdisc_running_key ?: &qdisc_running_key);
24660 +#else
24661         seqcount_init(&sch->running);
24662         lockdep_set_class(&sch->running,
24663                           dev->qdisc_running_key ?: &qdisc_running_key);
24664 +#endif
24665  
24666         sch->ops = ops;
24667         sch->enqueue = ops->enqueue;
24668 @@ -925,7 +937,7 @@ void dev_deactivate_many(struct list_head *head)
24669         /* Wait for outstanding qdisc_run calls. */
24670         list_for_each_entry(dev, head, close_list)
24671                 while (some_qdisc_is_busy(dev))
24672 -                       yield();
24673 +                       msleep(1);
24674  }
24675  
24676  void dev_deactivate(struct net_device *dev)
24677 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
24678 index 9c9db55a0c1e..e6583b018a72 100644
24679 --- a/net/sunrpc/svc_xprt.c
24680 +++ b/net/sunrpc/svc_xprt.c
24681 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
24682                 goto out;
24683         }
24684  
24685 -       cpu = get_cpu();
24686 +       cpu = get_cpu_light();
24687         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
24688  
24689         atomic_long_inc(&pool->sp_stats.packets);
24690 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
24691  
24692                 atomic_long_inc(&pool->sp_stats.threads_woken);
24693                 wake_up_process(rqstp->rq_task);
24694 -               put_cpu();
24695 +               put_cpu_light();
24696                 goto out;
24697         }
24698         rcu_read_unlock();
24699 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
24700                 goto redo_search;
24701         }
24702         rqstp = NULL;
24703 -       put_cpu();
24704 +       put_cpu_light();
24705  out:
24706         trace_svc_xprt_do_enqueue(xprt, rqstp);
24707  }
24708 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
24709 index 6fdc97ef6023..523e0420d7f0 100755
24710 --- a/scripts/mkcompile_h
24711 +++ b/scripts/mkcompile_h
24712 @@ -4,7 +4,8 @@ TARGET=$1
24713  ARCH=$2
24714  SMP=$3
24715  PREEMPT=$4
24716 -CC=$5
24717 +RT=$5
24718 +CC=$6
24719  
24720  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
24721  
24722 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
24723  CONFIG_FLAGS=""
24724  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
24725  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
24726 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
24727  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
24728  
24729  # Truncate to maximum length
24730 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
24731 index 9d33c1e85c79..3d307bda86f9 100644
24732 --- a/sound/core/pcm_native.c
24733 +++ b/sound/core/pcm_native.c
24734 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
24735  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
24736  {
24737         if (!substream->pcm->nonatomic)
24738 -               local_irq_disable();
24739 +               local_irq_disable_nort();
24740         snd_pcm_stream_lock(substream);
24741  }
24742  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
24743 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
24744  {
24745         snd_pcm_stream_unlock(substream);
24746         if (!substream->pcm->nonatomic)
24747 -               local_irq_enable();
24748 +               local_irq_enable_nort();
24749  }
24750  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
24751  
24752 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
24753  {
24754         unsigned long flags = 0;
24755         if (!substream->pcm->nonatomic)
24756 -               local_irq_save(flags);
24757 +               local_irq_save_nort(flags);
24758         snd_pcm_stream_lock(substream);
24759         return flags;
24760  }
24761 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
24762  {
24763         snd_pcm_stream_unlock(substream);
24764         if (!substream->pcm->nonatomic)
24765 -               local_irq_restore(flags);
24766 +               local_irq_restore_nort(flags);
24767  }
24768  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
24769  