1 diff --git a/Documentation/hwlat_detector.txt b/Documentation/hwlat_detector.txt
2 new file mode 100644
3 index 000000000000..cb61516483d3
4 --- /dev/null
5 +++ b/Documentation/hwlat_detector.txt
6 @@ -0,0 +1,64 @@
7 +Introduction:
8 +-------------
9 +
10 +The module hwlat_detector is a special purpose kernel module that is used to
11 +detect large system latencies induced by the behavior of certain underlying
12 +hardware or firmware, independent of Linux itself. The code was developed
13 +originally to detect SMIs (System Management Interrupts) on x86 systems,
14 +however there is nothing x86 specific about this patchset. It was
15 +originally written for use by the "RT" patch since the Real Time
16 +kernel is highly latency sensitive.
17 +
18 +SMIs are usually not serviced by the Linux kernel, which typically does not
19 +even know that they are occurring. SMIs are instead set up by BIOS code
20 +and are serviced by BIOS code, usually for "critical" events such as
21 +management of thermal sensors and fans. Sometimes though, SMIs are used for
22 +other tasks and those tasks can spend an inordinate amount of time in the
23 +handler (sometimes measured in milliseconds). Obviously this is a problem if
24 +you are trying to keep event service latencies down in the microsecond range.
25 +
26 +The hardware latency detector works by hogging all of the cpus for configurable
27 +amounts of time (by calling stop_machine()), polling the CPU Time Stamp Counter
28 +for some period, then looking for gaps in the TSC data. Any gap indicates a
29 +time when the polling was interrupted, and since the machine is stopped and
30 +interrupts are turned off, the only thing that could do that would be an SMI.
31 +
32 +Note that the SMI detector should *NEVER* be used in a production environment.
33 +It is intended to be run manually to determine if the hardware platform has a
34 +problem with long system firmware service routines.
35 +
36 +Usage:
37 +------
38 +
39 +Loading the module hwlat_detector with the parameter "enabled=1" (or
40 +setting the "enable" entry in the "hwlat_detector" debugfs directory to 1) is the
41 +only step required to start the hwlat_detector. It is possible to redefine the
42 +threshold in microseconds (us) above which latency spikes will be taken
43 +into account (parameter "threshold=").
44 +
45 +Example:
46 +
47 +       # modprobe hwlat_detector enabled=1 threshold=100
48 +
49 +After the module is loaded, it creates a directory named "hwlat_detector" under
50 +the debugfs mountpoint, referred to in this text as "/debug/hwlat_detector". It is
51 +necessary to have debugfs mounted, which might be at /sys/kernel/debug on your system.
52 +
53 +The /debug/hwlat_detector interface contains the following files:
54 +
55 +count                  - number of latency spikes observed since last reset
56 +enable                 - a global enable/disable toggle (0/1), resets count
57 +max                    - maximum hardware latency actually observed (usecs)
58 +sample                 - a pipe from which to read current raw sample data
59 +                         in the format <timestamp> <latency observed usecs>
60 +                         (can be opened O_NONBLOCK for a single sample)
61 +threshold              - minimum latency value to be considered (usecs)
62 +width                  - time period to sample with CPUs held (usecs)
63 +                         must be less than the total window size (enforced)
64 +window                 - total period of sampling, width being inside (usecs)
65 +
66 +By default we will set width to 500,000 and window to 1,000,000, meaning that
67 +we will sample every 1,000,000 usecs (1s) for 500,000 usecs (0.5s). If we
68 +observe any latencies that exceed the threshold (initially 100 usecs),
69 +then we write to a global sample ring buffer of 8K samples, which is
70 +consumed by reading from the "sample" (pipe) debugfs file interface.
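A minimal usage sketch (shell), assuming debugfs is mounted at /sys/kernel/debug so that the directory described above appears as /sys/kernel/debug/hwlat_detector:

       # modprobe hwlat_detector enabled=1 threshold=100
       # cat /sys/kernel/debug/hwlat_detector/count
       # cat /sys/kernel/debug/hwlat_detector/max
       # cat /sys/kernel/debug/hwlat_detector/sample

Reading "sample" blocks until a spike above the threshold has been recorded; as noted above, the file can be opened O_NONBLOCK to fetch at most a single sample.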
71 diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
72 index 3a3b30ac2a75..9e0745cafbd8 100644
73 --- a/Documentation/sysrq.txt
74 +++ b/Documentation/sysrq.txt
75 @@ -59,10 +59,17 @@ On PowerPC - Press 'ALT - Print Screen (or F13) - <command key>,
76  On other - If you know of the key combos for other architectures, please
77             let me know so I can add them to this section.
78  
79 -On all -  write a character to /proc/sysrq-trigger.  e.g.:
80 -
81 +On all -  write a character to /proc/sysrq-trigger, e.g.:
82                 echo t > /proc/sysrq-trigger
83  
84 +On all - Enable network SysRq by writing a cookie to icmp_echo_sysrq, e.g.
85 +               echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
86 +        Send an ICMP echo request with this pattern plus the particular
87 +        SysRq command key. Example:
88 +               # ping -c1 -s57 -p0102030468
89 +        will trigger the SysRq-H (help) command.
90 +
91 +
92  *  What are the 'command' keys?
93  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
94  'b'     - Will immediately reboot the system without syncing or unmounting
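A worked sketch of the network SysRq trigger documented above: the ICMP payload pattern is the cookie followed by the ASCII code of the desired command key, so with the cookie 0x01020304 the trailing byte 0x74 ('t') requests the task-list dump. The host names below are placeholders.

       target# echo 0x01020304 >/proc/sys/net/ipv4/icmp_echo_sysrq
       other-host# ping -c1 -s57 -p0102030474 target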
95 diff --git a/Documentation/trace/histograms.txt b/Documentation/trace/histograms.txt
96 new file mode 100644
97 index 000000000000..6f2aeabf7faa
98 --- /dev/null
99 +++ b/Documentation/trace/histograms.txt
100 @@ -0,0 +1,186 @@
101 +               Using the Linux Kernel Latency Histograms
102 +
103 +
104 +This document gives a short explanation of how to enable, configure and use
105 +latency histograms. Latency histograms are primarily relevant in the
106 +context of real-time enabled kernels (CONFIG_PREEMPT/CONFIG_PREEMPT_RT)
107 +and are used in the quality management of the Linux real-time
108 +capabilities.
109 +
110 +
111 +* Purpose of latency histograms
112 +
113 +A latency histogram continuously accumulates the frequencies of latency
114 +data. There are two types of histograms:
115 +- potential sources of latencies
116 +- effective latencies
117 +
118 +
119 +* Potential sources of latencies
120 +
121 +Potential sources of latencies are code segments where interrupts,
122 +preemption or both are disabled (aka critical sections). To create
123 +histograms of potential sources of latency, the kernel stores the time
124 +stamp at the start of a critical section, determines the time elapsed
125 +when the end of the section is reached, and increments the frequency
126 +counter of that latency value - irrespective of whether any concurrently
127 +running process is affected by latency or not.
128 +- Configuration items (in the Kernel hacking/Tracers submenu)
129 +  CONFIG_INTERRUPT_OFF_LATENCY
130 +  CONFIG_PREEMPT_OFF_LATENCY
131 +
132 +
133 +* Effective latencies
134 +
135 +Effective latencies are those actually occurring during the wakeup of a process. To
136 +determine effective latencies, the kernel stores the time stamp when a
137 +process is scheduled to be woken up, and determines the duration of the
138 +wakeup time shortly before control is passed over to this process. Note
139 +that the apparent latency in user space may be somewhat longer, since the
140 +process may be interrupted after control is passed over to it but before
141 +the execution in user space takes place. Simply measuring the interval
142 +between enqueuing and wakeup may also not be appropriate in cases when a
143 +process is scheduled as a result of a timer expiration. The timer may have
144 +missed its deadline, e.g. due to disabled interrupts, but this latency
145 +would not be registered. Therefore, the offsets of missed timers are
146 +recorded in a separate histogram. If both wakeup latency and missed timer
147 +offsets are configured and enabled, a third histogram may be enabled that
148 +records the overall latency as a sum of the timer latency, if any, and the
149 +wakeup latency. This histogram is called "timerandwakeup".
150 +- Configuration items (in the Kernel hacking/Tracers submenu)
151 +  CONFIG_WAKEUP_LATENCY
152 +  CONFIG_MISSED_TIMER_OFFSETS
153 +
154 +
155 +* Usage
156 +
157 +The interface to the administration of the latency histograms is located
158 +in the debugfs file system. To mount it, either enter
159 +
160 +mount -t sysfs nodev /sys
161 +mount -t debugfs nodev /sys/kernel/debug
162 +
163 +from shell command line level, or add
164 +
165 +nodev  /sys                    sysfs   defaults        0 0
166 +nodev  /sys/kernel/debug       debugfs defaults        0 0
167 +
168 +to the file /etc/fstab. All latency histogram related files are then
169 +available in the directory /sys/kernel/debug/tracing/latency_hist. A
170 +particular histogram type is enabled by writing non-zero to the related
171 +variable in the /sys/kernel/debug/tracing/latency_hist/enable directory.
172 +Select "preemptirqsoff" for the histograms of potential sources of
173 +latencies and "wakeup" for histograms of effective latencies etc. The
174 +histogram data - one per CPU - are available in the files
175 +
176 +/sys/kernel/debug/tracing/latency_hist/preemptoff/CPUx
177 +/sys/kernel/debug/tracing/latency_hist/irqsoff/CPUx
178 +/sys/kernel/debug/tracing/latency_hist/preemptirqsoff/CPUx
179 +/sys/kernel/debug/tracing/latency_hist/wakeup/CPUx
180 +/sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio/CPUx
181 +/sys/kernel/debug/tracing/latency_hist/missed_timer_offsets/CPUx
182 +/sys/kernel/debug/tracing/latency_hist/timerandwakeup/CPUx
183 +
184 +The histograms are reset by writing non-zero to the file "reset" in a
185 +particular latency directory. To reset all latency data, use
186 +
187 +#!/bin/sh
188 +
189 +TRACINGDIR=/sys/kernel/debug/tracing
190 +HISTDIR=$TRACINGDIR/latency_hist
191 +
192 +if test -d $HISTDIR
193 +then
194 +  cd $HISTDIR
195 +  for i in `find . | grep /reset$`
196 +  do
197 +    echo 1 >$i
198 +  done
199 +fi
200 +
201 +
202 +* Data format
203 +
204 +Latency data are stored with a resolution of one microsecond. The
205 +maximum latency is 10,240 microseconds. The data are only valid if the
206 +overflow register is empty. Every output line contains the latency in
207 +microseconds in the first column and the number of samples in the second
208 +column. To display only lines with a positive latency count, use, for
209 +example,
210 +
211 +grep -v " 0$" /sys/kernel/debug/tracing/latency_hist/preemptoff/CPU0
212 +
213 +#Minimum latency: 0 microseconds.
214 +#Average latency: 0 microseconds.
215 +#Maximum latency: 25 microseconds.
216 +#Total samples: 3104770694
217 +#There are 0 samples greater or equal than 10240 microseconds
218 +#usecs          samples
219 +    0        2984486876
220 +    1          49843506
221 +    2          58219047
222 +    3           5348126
223 +    4           2187960
224 +    5           3388262
225 +    6            959289
226 +    7            208294
227 +    8             40420
228 +    9              4485
229 +   10             14918
230 +   11             18340
231 +   12             25052
232 +   13             19455
233 +   14              5602
234 +   15               969
235 +   16                47
236 +   17                18
237 +   18                14
238 +   19                 1
239 +   20                 3
240 +   21                 2
241 +   22                 5
242 +   23                 2
243 +   25                 1
244 +
245 +
246 +* Wakeup latency of a selected process
247 +
248 +To only collect wakeup latency data of a particular process, write the
249 +PID of the requested process to
250 +
251 +/sys/kernel/debug/tracing/latency_hist/wakeup/pid
252 +
253 +PIDs are not considered if this variable is set to 0.
254 +
255 +
256 +* Details of the process with the highest wakeup latency so far
257 +
258 +Selected data of the process that suffered from the highest wakeup
259 +latency that occurred in a particular CPU are available in the file
260 +
261 +/sys/kernel/debug/tracing/latency_hist/wakeup/max_latency-CPUx.
262 +
263 +In addition, other relevant system data at the time when the
264 +latency occurred are given.
265 +
266 +The format of the data is (all in one line):
267 +<PID> <Priority> <Latency> (<Timeroffset>) <Command> \
268 +<- <PID> <Priority> <Command> <Timestamp>
269 +
270 +The value of <Timeroffset> is only relevant in the combined timer
271 +and wakeup latency recording. In the wakeup recording, it is
272 +always 0, in the missed_timer_offsets recording, it is the same
273 +as <Latency>.
274 +
275 +When retrospectively searching for the origin of a latency while
276 +tracing was not enabled, it may be helpful to know the name and
277 +some basic data of the task that (finally) switched to the
278 +late real-time task. In addition to the victim's data, the data
279 +of the possible culprit are therefore also displayed after the
280 +"<-" symbol.
281 +
282 +Finally, the timestamp of the time when the latency occurred
283 +in <seconds>.<microseconds> after the most recent system boot
284 +is provided.
285 +
286 +These data are also reset when the wakeup histogram is reset.
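A minimal end-to-end sketch (shell) of the interface described above, assuming the corresponding histogram options are configured in; $PID stands for the process of interest:

       TRACINGDIR=/sys/kernel/debug/tracing
       HISTDIR=$TRACINGDIR/latency_hist

       # enable wakeup latency histograms
       echo 1 >$HISTDIR/enable/wakeup
       # optionally restrict recording to a single process
       echo $PID >$HISTDIR/wakeup/pid
       # inspect non-empty bins and the worst case seen so far on CPU0
       grep -v " 0$" $HISTDIR/wakeup/CPU0
       cat $HISTDIR/wakeup/max_latency-CPU0
       # reset the wakeup histograms, following the reset script above
       for i in `find $HISTDIR/wakeup | grep /reset$`; do echo 1 >$i; done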
287 diff --git a/Makefile b/Makefile
288 index b249529204cd..5d699d055995 100644
289 --- a/Makefile
290 +++ b/Makefile
291 @@ -398,12 +398,12 @@ KBUILD_CPPFLAGS := -D__KERNEL__
292  KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
293                    -fno-strict-aliasing -fno-common \
294                    -Werror-implicit-function-declaration \
295 -                  -Wno-format-security \
296 +                  -Wno-format-security -fno-PIE \
297                    -std=gnu89
298  
299  KBUILD_AFLAGS_KERNEL :=
300  KBUILD_CFLAGS_KERNEL :=
301 -KBUILD_AFLAGS   := -D__ASSEMBLY__
302 +KBUILD_AFLAGS   := -D__ASSEMBLY__ -fno-PIE
303  KBUILD_AFLAGS_MODULE  := -DMODULE
304  KBUILD_CFLAGS_MODULE  := -DMODULE
305  KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
306 diff --git a/arch/Kconfig b/arch/Kconfig
307 index fd6e9712af81..085134ee13e9 100644
308 --- a/arch/Kconfig
309 +++ b/arch/Kconfig
310 @@ -9,6 +9,7 @@ config OPROFILE
311         tristate "OProfile system profiling"
312         depends on PROFILING
313         depends on HAVE_OPROFILE
314 +       depends on !PREEMPT_RT_FULL
315         select RING_BUFFER
316         select RING_BUFFER_ALLOW_SWAP
317         help
318 @@ -52,6 +53,7 @@ config KPROBES
319  config JUMP_LABEL
320         bool "Optimize very unlikely/likely branches"
321         depends on HAVE_ARCH_JUMP_LABEL
322 +       depends on (!INTERRUPT_OFF_HIST && !PREEMPT_OFF_HIST && !WAKEUP_LATENCY_HIST && !MISSED_TIMER_OFFSETS_HIST)
323         help
324           This option enables a transparent branch optimization that
325          makes certain almost-always-true or almost-always-false branch
326 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
327 index a9c4e48bb7ec..6eefe4f32302 100644
328 --- a/arch/arm/Kconfig
329 +++ b/arch/arm/Kconfig
330 @@ -36,7 +36,7 @@ config ARM
331         select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
332         select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
333         select HAVE_ARCH_HARDENED_USERCOPY
334 -       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
335 +       select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE
336         select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
337         select HAVE_ARCH_MMAP_RND_BITS if MMU
338         select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT)
339 @@ -75,6 +75,7 @@ config ARM
340         select HAVE_PERF_EVENTS
341         select HAVE_PERF_REGS
342         select HAVE_PERF_USER_STACK_DUMP
343 +       select HAVE_PREEMPT_LAZY
344         select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
345         select HAVE_REGS_AND_STACK_ACCESS_API
346         select HAVE_SYSCALL_TRACEPOINTS
347 diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
348 index 12ebfcc1d539..c962084605bc 100644
349 --- a/arch/arm/include/asm/switch_to.h
350 +++ b/arch/arm/include/asm/switch_to.h
351 @@ -3,6 +3,13 @@
352  
353  #include <linux/thread_info.h>
354  
355 +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM
356 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p);
357 +#else
358 +static inline void
359 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
360 +#endif
361 +
362  /*
363   * For v7 SMP cores running a preemptible kernel we may be pre-empted
364   * during a TLB maintenance operation, so execute an inner-shareable dsb
365 @@ -25,6 +32,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
366  #define switch_to(prev,next,last)                                      \
367  do {                                                                   \
368         __complete_pending_tlbi();                                      \
369 +       switch_kmaps(prev, next);                                       \
370         last = __switch_to(prev,task_thread_info(prev), task_thread_info(next));        \
371  } while (0)
372  
373 diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
374 index 776757d1604a..1f36a4eccc72 100644
375 --- a/arch/arm/include/asm/thread_info.h
376 +++ b/arch/arm/include/asm/thread_info.h
377 @@ -49,6 +49,7 @@ struct cpu_context_save {
378  struct thread_info {
379         unsigned long           flags;          /* low level flags */
380         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
381 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
382         mm_segment_t            addr_limit;     /* address limit */
383         struct task_struct      *task;          /* main task structure */
384         __u32                   cpu;            /* cpu */
385 @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
386  #define TIF_SYSCALL_TRACE      4       /* syscall trace active */
387  #define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
388  #define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
389 -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
390 +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
391 +#define TIF_NEED_RESCHED_LAZY  7
392  
393  #define TIF_NOHZ               12      /* in adaptive nohz mode */
394  #define TIF_USING_IWMMXT       17
395 @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
396  #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
397  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
398  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
399 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
400  #define _TIF_UPROBE            (1 << TIF_UPROBE)
401  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
402  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
403 @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
404   * Change these and you break ASM code in entry-common.S
405   */
406  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
407 -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
408 +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
409 +                                _TIF_NEED_RESCHED_LAZY)
410  
411  #endif /* __KERNEL__ */
412  #endif /* __ASM_ARM_THREAD_INFO_H */
413 diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
414 index 608008229c7d..3866da3f7bb7 100644
415 --- a/arch/arm/kernel/asm-offsets.c
416 +++ b/arch/arm/kernel/asm-offsets.c
417 @@ -65,6 +65,7 @@ int main(void)
418    BLANK();
419    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
420    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
421 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
422    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
423    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
424    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
425 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
426 index 9f157e7c51e7..468e224d76aa 100644
427 --- a/arch/arm/kernel/entry-armv.S
428 +++ b/arch/arm/kernel/entry-armv.S
429 @@ -220,11 +220,18 @@ ENDPROC(__dabt_svc)
430  
431  #ifdef CONFIG_PREEMPT
432         ldr     r8, [tsk, #TI_PREEMPT]          @ get preempt count
433 -       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
434         teq     r8, #0                          @ if preempt count != 0
435 +       bne     1f                              @ return from exception
436 +       ldr     r0, [tsk, #TI_FLAGS]            @ get flags
437 +       tst     r0, #_TIF_NEED_RESCHED          @ if NEED_RESCHED is set
438 +       blne    svc_preempt                     @ preempt!
439 +
440 +       ldr     r8, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
441 +       teq     r8, #0                          @ if preempt lazy count != 0
442         movne   r0, #0                          @ force flags to 0
443 -       tst     r0, #_TIF_NEED_RESCHED
444 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
445         blne    svc_preempt
446 +1:
447  #endif
448  
449         svc_exit r5, irq = 1                    @ return from exception
450 @@ -239,8 +246,14 @@ ENDPROC(__irq_svc)
451  1:     bl      preempt_schedule_irq            @ irq en/disable is done inside
452         ldr     r0, [tsk, #TI_FLAGS]            @ get new tasks TI_FLAGS
453         tst     r0, #_TIF_NEED_RESCHED
454 +       bne     1b
455 +       tst     r0, #_TIF_NEED_RESCHED_LAZY
456         reteq   r8                              @ go again
457 -       b       1b
458 +       ldr     r0, [tsk, #TI_PREEMPT_LAZY]     @ get preempt lazy count
459 +       teq     r0, #0                          @ if preempt lazy count != 0
460 +       beq     1b
461 +       ret     r8                              @ go again
462 +
463  #endif
464  
465  __und_fault:
466 diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
467 index 10c3283d6c19..8872937862cc 100644
468 --- a/arch/arm/kernel/entry-common.S
469 +++ b/arch/arm/kernel/entry-common.S
470 @@ -36,7 +36,9 @@
471   UNWIND(.cantunwind    )
472         disable_irq_notrace                     @ disable interrupts
473         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
474 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
475 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
476 +       bne     fast_work_pending
477 +       tst     r1, #_TIF_SECCOMP
478         bne     fast_work_pending
479  
480         /* perform architecture specific actions before user return */
481 @@ -62,8 +64,11 @@ ENDPROC(ret_fast_syscall)
482         str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
483         disable_irq_notrace                     @ disable interrupts
484         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
485 -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
486 +       tst     r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP)
487 +       bne     do_slower_path
488 +       tst     r1, #_TIF_SECCOMP
489         beq     no_work_pending
490 +do_slower_path:
491   UNWIND(.fnend         )
492  ENDPROC(ret_fast_syscall)
493  
494 diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
495 index 612eb530f33f..cd3006dc1fd3 100644
496 --- a/arch/arm/kernel/process.c
497 +++ b/arch/arm/kernel/process.c
498 @@ -323,6 +323,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
499  }
500  
501  #ifdef CONFIG_MMU
502 +/*
503 + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock.  If the lock is not
504 + * initialized by pgtable_page_ctor() then a coredump of the vector page will
505 + * fail.
506 + */
507 +static int __init vectors_user_mapping_init_page(void)
508 +{
509 +       struct page *page;
510 +       unsigned long addr = 0xffff0000;
511 +       pgd_t *pgd;
512 +       pud_t *pud;
513 +       pmd_t *pmd;
514 +
515 +       pgd = pgd_offset_k(addr);
516 +       pud = pud_offset(pgd, addr);
517 +       pmd = pmd_offset(pud, addr);
518 +       page = pmd_page(*(pmd));
519 +
520 +       pgtable_page_ctor(page);
521 +
522 +       return 0;
523 +}
524 +late_initcall(vectors_user_mapping_init_page);
525 +
526  #ifdef CONFIG_KUSER_HELPERS
527  /*
528   * The vectors page is always readable from user space for the
529 diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
530 index 7b8f2141427b..96541e00b74a 100644
531 --- a/arch/arm/kernel/signal.c
532 +++ b/arch/arm/kernel/signal.c
533 @@ -572,7 +572,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
534          */
535         trace_hardirqs_off();
536         do {
537 -               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
538 +               if (likely(thread_flags & (_TIF_NEED_RESCHED |
539 +                                          _TIF_NEED_RESCHED_LAZY))) {
540                         schedule();
541                 } else {
542                         if (unlikely(!user_mode(regs)))
543 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
544 index 861521606c6d..e5ca865d321b 100644
545 --- a/arch/arm/kernel/smp.c
546 +++ b/arch/arm/kernel/smp.c
547 @@ -234,8 +234,6 @@ int __cpu_disable(void)
548         flush_cache_louis();
549         local_flush_tlb_all();
550  
551 -       clear_tasks_mm_cpumask(cpu);
552 -
553         return 0;
554  }
555  
556 @@ -251,6 +249,9 @@ void __cpu_die(unsigned int cpu)
557                 pr_err("CPU%u: cpu didn't die\n", cpu);
558                 return;
559         }
560 +
561 +       clear_tasks_mm_cpumask(cpu);
562 +
563         pr_notice("CPU%u: shutdown\n", cpu);
564  
565         /*
566 diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
567 index 0bee233fef9a..314cfb232a63 100644
568 --- a/arch/arm/kernel/unwind.c
569 +++ b/arch/arm/kernel/unwind.c
570 @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[];
571  static const struct unwind_idx *__origin_unwind_idx;
572  extern const struct unwind_idx __stop_unwind_idx[];
573  
574 -static DEFINE_SPINLOCK(unwind_lock);
575 +static DEFINE_RAW_SPINLOCK(unwind_lock);
576  static LIST_HEAD(unwind_tables);
577  
578  /* Convert a prel31 symbol to an absolute address */
579 @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
580                 /* module unwind tables */
581                 struct unwind_table *table;
582  
583 -               spin_lock_irqsave(&unwind_lock, flags);
584 +               raw_spin_lock_irqsave(&unwind_lock, flags);
585                 list_for_each_entry(table, &unwind_tables, list) {
586                         if (addr >= table->begin_addr &&
587                             addr < table->end_addr) {
588 @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr)
589                                 break;
590                         }
591                 }
592 -               spin_unlock_irqrestore(&unwind_lock, flags);
593 +               raw_spin_unlock_irqrestore(&unwind_lock, flags);
594         }
595  
596         pr_debug("%s: idx = %p\n", __func__, idx);
597 @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size,
598         tab->begin_addr = text_addr;
599         tab->end_addr = text_addr + text_size;
600  
601 -       spin_lock_irqsave(&unwind_lock, flags);
602 +       raw_spin_lock_irqsave(&unwind_lock, flags);
603         list_add_tail(&tab->list, &unwind_tables);
604 -       spin_unlock_irqrestore(&unwind_lock, flags);
605 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
606  
607         return tab;
608  }
609 @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab)
610         if (!tab)
611                 return;
612  
613 -       spin_lock_irqsave(&unwind_lock, flags);
614 +       raw_spin_lock_irqsave(&unwind_lock, flags);
615         list_del(&tab->list);
616 -       spin_unlock_irqrestore(&unwind_lock, flags);
617 +       raw_spin_unlock_irqrestore(&unwind_lock, flags);
618  
619         kfree(tab);
620  }
621 diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
622 index c94b90d43772..244dde72018a 100644
623 --- a/arch/arm/kvm/arm.c
624 +++ b/arch/arm/kvm/arm.c
625 @@ -584,7 +584,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
626                  * involves poking the GIC, which must be done in a
627                  * non-preemptible context.
628                  */
629 -               preempt_disable();
630 +               migrate_disable();
631                 kvm_pmu_flush_hwstate(vcpu);
632                 kvm_timer_flush_hwstate(vcpu);
633                 kvm_vgic_flush_hwstate(vcpu);
634 @@ -605,7 +605,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
635                         kvm_pmu_sync_hwstate(vcpu);
636                         kvm_timer_sync_hwstate(vcpu);
637                         kvm_vgic_sync_hwstate(vcpu);
638 -                       preempt_enable();
639 +                       migrate_enable();
640                         continue;
641                 }
642  
643 @@ -661,7 +661,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
644  
645                 kvm_vgic_sync_hwstate(vcpu);
646  
647 -               preempt_enable();
648 +               migrate_enable();
649  
650                 ret = handle_exit(vcpu, run, ret);
651         }
652 diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c
653 index 98ffe1e62ad5..df9769ddece5 100644
654 --- a/arch/arm/mach-exynos/platsmp.c
655 +++ b/arch/arm/mach-exynos/platsmp.c
656 @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void)
657         return (void __iomem *)(S5P_VA_SCU);
658  }
659  
660 -static DEFINE_SPINLOCK(boot_lock);
661 +static DEFINE_RAW_SPINLOCK(boot_lock);
662  
663  static void exynos_secondary_init(unsigned int cpu)
664  {
665 @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu)
666         /*
667          * Synchronise with the boot thread.
668          */
669 -       spin_lock(&boot_lock);
670 -       spin_unlock(&boot_lock);
671 +       raw_spin_lock(&boot_lock);
672 +       raw_spin_unlock(&boot_lock);
673  }
674  
675  int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr)
676 @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
677          * Set synchronisation state between this boot processor
678          * and the secondary one
679          */
680 -       spin_lock(&boot_lock);
681 +       raw_spin_lock(&boot_lock);
682  
683         /*
684          * The secondary processor is waiting to be released from
685 @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
686  
687                 if (timeout == 0) {
688                         printk(KERN_ERR "cpu1 power enable failed");
689 -                       spin_unlock(&boot_lock);
690 +                       raw_spin_unlock(&boot_lock);
691                         return -ETIMEDOUT;
692                 }
693         }
694 @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle)
695          * calibrations, then wait for it to finish
696          */
697  fail:
698 -       spin_unlock(&boot_lock);
699 +       raw_spin_unlock(&boot_lock);
700  
701         return pen_release != -1 ? ret : 0;
702  }
703 diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c
704 index 4b653a8cb75c..b03d5a922cb1 100644
705 --- a/arch/arm/mach-hisi/platmcpm.c
706 +++ b/arch/arm/mach-hisi/platmcpm.c
707 @@ -61,7 +61,7 @@
708  
709  static void __iomem *sysctrl, *fabric;
710  static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER];
711 -static DEFINE_SPINLOCK(boot_lock);
712 +static DEFINE_RAW_SPINLOCK(boot_lock);
713  static u32 fabric_phys_addr;
714  /*
715   * [0]: bootwrapper physical address
716 @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
717         if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER)
718                 return -EINVAL;
719  
720 -       spin_lock_irq(&boot_lock);
721 +       raw_spin_lock_irq(&boot_lock);
722  
723         if (hip04_cpu_table[cluster][cpu])
724                 goto out;
725 @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle)
726  
727  out:
728         hip04_cpu_table[cluster][cpu]++;
729 -       spin_unlock_irq(&boot_lock);
730 +       raw_spin_unlock_irq(&boot_lock);
731  
732         return 0;
733  }
734 @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu)
735         cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
736         cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
737  
738 -       spin_lock(&boot_lock);
739 +       raw_spin_lock(&boot_lock);
740         hip04_cpu_table[cluster][cpu]--;
741         if (hip04_cpu_table[cluster][cpu] == 1) {
742                 /* A power_up request went ahead of us. */
743 -               spin_unlock(&boot_lock);
744 +               raw_spin_unlock(&boot_lock);
745                 return;
746         } else if (hip04_cpu_table[cluster][cpu] > 1) {
747                 pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu);
748 @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu)
749         }
750  
751         last_man = hip04_cluster_is_down(cluster);
752 -       spin_unlock(&boot_lock);
753 +       raw_spin_unlock(&boot_lock);
754         if (last_man) {
755                 /* Since it's Cortex A15, disable L2 prefetching. */
756                 asm volatile(
757 @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu)
758                cpu >= HIP04_MAX_CPUS_PER_CLUSTER);
759  
760         count = TIMEOUT_MSEC / POLL_MSEC;
761 -       spin_lock_irq(&boot_lock);
762 +       raw_spin_lock_irq(&boot_lock);
763         for (tries = 0; tries < count; tries++) {
764                 if (hip04_cpu_table[cluster][cpu])
765                         goto err;
766 @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
767                 data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster));
768                 if (data & CORE_WFI_STATUS(cpu))
769                         break;
770 -               spin_unlock_irq(&boot_lock);
771 +               raw_spin_unlock_irq(&boot_lock);
772                 /* Wait for clean L2 when the whole cluster is down. */
773                 msleep(POLL_MSEC);
774 -               spin_lock_irq(&boot_lock);
775 +               raw_spin_lock_irq(&boot_lock);
776         }
777         if (tries >= count)
778                 goto err;
779 @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu)
780                 goto err;
781         if (hip04_cluster_is_down(cluster))
782                 hip04_set_snoop_filter(cluster, 0);
783 -       spin_unlock_irq(&boot_lock);
784 +       raw_spin_unlock_irq(&boot_lock);
785         return 1;
786  err:
787 -       spin_unlock_irq(&boot_lock);
788 +       raw_spin_unlock_irq(&boot_lock);
789         return 0;
790  }
791  #endif
792 diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c
793 index b4de3da6dffa..b52893319d75 100644
794 --- a/arch/arm/mach-omap2/omap-smp.c
795 +++ b/arch/arm/mach-omap2/omap-smp.c
796 @@ -64,7 +64,7 @@ static const struct omap_smp_config omap5_cfg __initconst = {
797         .startup_addr = omap5_secondary_startup,
798  };
799  
800 -static DEFINE_SPINLOCK(boot_lock);
801 +static DEFINE_RAW_SPINLOCK(boot_lock);
802  
803  void __iomem *omap4_get_scu_base(void)
804  {
805 @@ -131,8 +131,8 @@ static void omap4_secondary_init(unsigned int cpu)
806         /*
807          * Synchronise with the boot thread.
808          */
809 -       spin_lock(&boot_lock);
810 -       spin_unlock(&boot_lock);
811 +       raw_spin_lock(&boot_lock);
812 +       raw_spin_unlock(&boot_lock);
813  }
814  
815  static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
816 @@ -146,7 +146,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
817          * Set synchronisation state between this boot processor
818          * and the secondary one
819          */
820 -       spin_lock(&boot_lock);
821 +       raw_spin_lock(&boot_lock);
822  
823         /*
824          * Update the AuxCoreBoot0 with boot state for secondary core.
825 @@ -223,7 +223,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle)
826          * Now the secondary core is starting up let it run its
827          * calibrations, then wait for it to finish
828          */
829 -       spin_unlock(&boot_lock);
830 +       raw_spin_unlock(&boot_lock);
831  
832         return 0;
833  }
834 diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c
835 index 0875b99add18..18b6d98d2581 100644
836 --- a/arch/arm/mach-prima2/platsmp.c
837 +++ b/arch/arm/mach-prima2/platsmp.c
838 @@ -22,7 +22,7 @@
839  
840  static void __iomem *clk_base;
841  
842 -static DEFINE_SPINLOCK(boot_lock);
843 +static DEFINE_RAW_SPINLOCK(boot_lock);
844  
845  static void sirfsoc_secondary_init(unsigned int cpu)
846  {
847 @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu)
848         /*
849          * Synchronise with the boot thread.
850          */
851 -       spin_lock(&boot_lock);
852 -       spin_unlock(&boot_lock);
853 +       raw_spin_lock(&boot_lock);
854 +       raw_spin_unlock(&boot_lock);
855  }
856  
857  static const struct of_device_id clk_ids[]  = {
858 @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
859         /* make sure write buffer is drained */
860         mb();
861  
862 -       spin_lock(&boot_lock);
863 +       raw_spin_lock(&boot_lock);
864  
865         /*
866          * The secondary processor is waiting to be released from
867 @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle)
868          * now the secondary core is starting up let it run its
869          * calibrations, then wait for it to finish
870          */
871 -       spin_unlock(&boot_lock);
872 +       raw_spin_unlock(&boot_lock);
873  
874         return pen_release != -1 ? -ENOSYS : 0;
875  }
876 diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
877 index 5494c9e0c909..e8ce157d3548 100644
878 --- a/arch/arm/mach-qcom/platsmp.c
879 +++ b/arch/arm/mach-qcom/platsmp.c
880 @@ -46,7 +46,7 @@
881  
882  extern void secondary_startup_arm(void);
883  
884 -static DEFINE_SPINLOCK(boot_lock);
885 +static DEFINE_RAW_SPINLOCK(boot_lock);
886  
887  #ifdef CONFIG_HOTPLUG_CPU
888  static void qcom_cpu_die(unsigned int cpu)
889 @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu)
890         /*
891          * Synchronise with the boot thread.
892          */
893 -       spin_lock(&boot_lock);
894 -       spin_unlock(&boot_lock);
895 +       raw_spin_lock(&boot_lock);
896 +       raw_spin_unlock(&boot_lock);
897  }
898  
899  static int scss_release_secondary(unsigned int cpu)
900 @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
901          * set synchronisation state between this boot processor
902          * and the secondary one
903          */
904 -       spin_lock(&boot_lock);
905 +       raw_spin_lock(&boot_lock);
906  
907         /*
908          * Send the secondary CPU a soft interrupt, thereby causing
909 @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int))
910          * now the secondary core is starting up let it run its
911          * calibrations, then wait for it to finish
912          */
913 -       spin_unlock(&boot_lock);
914 +       raw_spin_unlock(&boot_lock);
915  
916         return ret;
917  }
918 diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c
919 index 8d1e2d551786..7fa56cc78118 100644
920 --- a/arch/arm/mach-spear/platsmp.c
921 +++ b/arch/arm/mach-spear/platsmp.c
922 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
923         sync_cache_w(&pen_release);
924  }
925  
926 -static DEFINE_SPINLOCK(boot_lock);
927 +static DEFINE_RAW_SPINLOCK(boot_lock);
928  
929  static void __iomem *scu_base = IOMEM(VA_SCU_BASE);
930  
931 @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu)
932         /*
933          * Synchronise with the boot thread.
934          */
935 -       spin_lock(&boot_lock);
936 -       spin_unlock(&boot_lock);
937 +       raw_spin_lock(&boot_lock);
938 +       raw_spin_unlock(&boot_lock);
939  }
940  
941  static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
942 @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
943          * set synchronisation state between this boot processor
944          * and the secondary one
945          */
946 -       spin_lock(&boot_lock);
947 +       raw_spin_lock(&boot_lock);
948  
949         /*
950          * The secondary processor is waiting to be released from
951 @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle)
952          * now the secondary core is starting up let it run its
953          * calibrations, then wait for it to finish
954          */
955 -       spin_unlock(&boot_lock);
956 +       raw_spin_unlock(&boot_lock);
957  
958         return pen_release != -1 ? -ENOSYS : 0;
959  }
960 diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c
961 index ea5a2277ee46..b988e081ac79 100644
962 --- a/arch/arm/mach-sti/platsmp.c
963 +++ b/arch/arm/mach-sti/platsmp.c
964 @@ -35,7 +35,7 @@ static void write_pen_release(int val)
965         sync_cache_w(&pen_release);
966  }
967  
968 -static DEFINE_SPINLOCK(boot_lock);
969 +static DEFINE_RAW_SPINLOCK(boot_lock);
970  
971  static void sti_secondary_init(unsigned int cpu)
972  {
973 @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu)
974         /*
975          * Synchronise with the boot thread.
976          */
977 -       spin_lock(&boot_lock);
978 -       spin_unlock(&boot_lock);
979 +       raw_spin_lock(&boot_lock);
980 +       raw_spin_unlock(&boot_lock);
981  }
982  
983  static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
984 @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
985          * set synchronisation state between this boot processor
986          * and the secondary one
987          */
988 -       spin_lock(&boot_lock);
989 +       raw_spin_lock(&boot_lock);
990  
991         /*
992          * The secondary processor is waiting to be released from
993 @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle)
994          * now the secondary core is starting up let it run its
995          * calibrations, then wait for it to finish
996          */
997 -       spin_unlock(&boot_lock);
998 +       raw_spin_unlock(&boot_lock);
999  
1000         return pen_release != -1 ? -ENOSYS : 0;
1001  }
1002 diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
1003 index 3a2e678b8d30..3ed1e9ba6a01 100644
1004 --- a/arch/arm/mm/fault.c
1005 +++ b/arch/arm/mm/fault.c
1006 @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1007         if (addr < TASK_SIZE)
1008                 return do_page_fault(addr, fsr, regs);
1009  
1010 +       if (interrupts_enabled(regs))
1011 +               local_irq_enable();
1012 +
1013         if (user_mode(regs))
1014                 goto bad_area;
1015  
1016 @@ -497,6 +500,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr,
1017  static int
1018  do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
1019  {
1020 +       if (interrupts_enabled(regs))
1021 +               local_irq_enable();
1022 +
1023         do_bad_area(addr, fsr, regs);
1024         return 0;
1025  }
1026 diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
1027 index d02f8187b1cc..542692dbd40a 100644
1028 --- a/arch/arm/mm/highmem.c
1029 +++ b/arch/arm/mm/highmem.c
1030 @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr)
1031         return *ptep;
1032  }
1033  
1034 +static unsigned int fixmap_idx(int type)
1035 +{
1036 +       return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1037 +}
1038 +
1039  void *kmap(struct page *page)
1040  {
1041         might_sleep();
1042 @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap);
1043  
1044  void *kmap_atomic(struct page *page)
1045  {
1046 +       pte_t pte = mk_pte(page, kmap_prot);
1047         unsigned int idx;
1048         unsigned long vaddr;
1049         void *kmap;
1050         int type;
1051  
1052 -       preempt_disable();
1053 +       preempt_disable_nort();
1054         pagefault_disable();
1055         if (!PageHighMem(page))
1056                 return page_address(page);
1057 @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page)
1058  
1059         type = kmap_atomic_idx_push();
1060  
1061 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1062 +       idx = fixmap_idx(type);
1063         vaddr = __fix_to_virt(idx);
1064  #ifdef CONFIG_DEBUG_HIGHMEM
1065         /*
1066 @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page)
1067          * in place, so the contained TLB flush ensures the TLB is updated
1068          * with the new mapping.
1069          */
1070 -       set_fixmap_pte(idx, mk_pte(page, kmap_prot));
1071 +#ifdef CONFIG_PREEMPT_RT_FULL
1072 +       current->kmap_pte[type] = pte;
1073 +#endif
1074 +       set_fixmap_pte(idx, pte);
1075  
1076         return (void *)vaddr;
1077  }
1078 @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr)
1079  
1080         if (kvaddr >= (void *)FIXADDR_START) {
1081                 type = kmap_atomic_idx();
1082 -               idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1083 +               idx = fixmap_idx(type);
1084  
1085                 if (cache_is_vivt())
1086                         __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE);
1087 +#ifdef CONFIG_PREEMPT_RT_FULL
1088 +               current->kmap_pte[type] = __pte(0);
1089 +#endif
1090  #ifdef CONFIG_DEBUG_HIGHMEM
1091                 BUG_ON(vaddr != __fix_to_virt(idx));
1092 -               set_fixmap_pte(idx, __pte(0));
1093  #else
1094                 (void) idx;  /* to kill a warning */
1095  #endif
1096 +               set_fixmap_pte(idx, __pte(0));
1097                 kmap_atomic_idx_pop();
1098         } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
1099                 /* this address was obtained through kmap_high_get() */
1100                 kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
1101         }
1102         pagefault_enable();
1103 -       preempt_enable();
1104 +       preempt_enable_nort();
1105  }
1106  EXPORT_SYMBOL(__kunmap_atomic);
1107  
1108  void *kmap_atomic_pfn(unsigned long pfn)
1109  {
1110 +       pte_t pte = pfn_pte(pfn, kmap_prot);
1111         unsigned long vaddr;
1112         int idx, type;
1113         struct page *page = pfn_to_page(pfn);
1114  
1115 -       preempt_disable();
1116 +       preempt_disable_nort();
1117         pagefault_disable();
1118         if (!PageHighMem(page))
1119                 return page_address(page);
1120  
1121         type = kmap_atomic_idx_push();
1122 -       idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id();
1123 +       idx = fixmap_idx(type);
1124         vaddr = __fix_to_virt(idx);
1125  #ifdef CONFIG_DEBUG_HIGHMEM
1126         BUG_ON(!pte_none(get_fixmap_pte(vaddr)));
1127  #endif
1128 -       set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot));
1129 +#ifdef CONFIG_PREEMPT_RT_FULL
1130 +       current->kmap_pte[type] = pte;
1131 +#endif
1132 +       set_fixmap_pte(idx, pte);
1133  
1134         return (void *)vaddr;
1135  }
1136 +#if defined CONFIG_PREEMPT_RT_FULL
1137 +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
1138 +{
1139 +       int i;
1140 +
1141 +       /*
1142 +        * Clear @prev's kmap_atomic mappings
1143 +        */
1144 +       for (i = 0; i < prev_p->kmap_idx; i++) {
1145 +               int idx = fixmap_idx(i);
1146 +
1147 +               set_fixmap_pte(idx, __pte(0));
1148 +       }
1149 +       /*
1150 +        * Restore @next_p's kmap_atomic mappings
1151 +        */
1152 +       for (i = 0; i < next_p->kmap_idx; i++) {
1153 +               int idx = fixmap_idx(i);
1154 +
1155 +               if (!pte_none(next_p->kmap_pte[i]))
1156 +                       set_fixmap_pte(idx, next_p->kmap_pte[i]);
1157 +       }
1158 +}
1159 +#endif
1160 diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c
1161 index c2366510187a..6b60f582b738 100644
1162 --- a/arch/arm/plat-versatile/platsmp.c
1163 +++ b/arch/arm/plat-versatile/platsmp.c
1164 @@ -32,7 +32,7 @@ static void write_pen_release(int val)
1165         sync_cache_w(&pen_release);
1166  }
1167  
1168 -static DEFINE_SPINLOCK(boot_lock);
1169 +static DEFINE_RAW_SPINLOCK(boot_lock);
1170  
1171  void versatile_secondary_init(unsigned int cpu)
1172  {
1173 @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu)
1174         /*
1175          * Synchronise with the boot thread.
1176          */
1177 -       spin_lock(&boot_lock);
1178 -       spin_unlock(&boot_lock);
1179 +       raw_spin_lock(&boot_lock);
1180 +       raw_spin_unlock(&boot_lock);
1181  }
1182  
1183  int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1184 @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1185          * Set synchronisation state between this boot processor
1186          * and the secondary one
1187          */
1188 -       spin_lock(&boot_lock);
1189 +       raw_spin_lock(&boot_lock);
1190  
1191         /*
1192          * This is really belt and braces; we hold unintended secondary
1193 @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle)
1194          * now the secondary core is starting up let it run its
1195          * calibrations, then wait for it to finish
1196          */
1197 -       spin_unlock(&boot_lock);
1198 +       raw_spin_unlock(&boot_lock);
1199  
1200         return pen_release != -1 ? -ENOSYS : 0;
1201  }
1202 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
1203 index bc3f00f586f1..0f3df6d5154a 100644
1204 --- a/arch/arm64/Kconfig
1205 +++ b/arch/arm64/Kconfig
1206 @@ -90,6 +90,7 @@ config ARM64
1207         select HAVE_PERF_EVENTS
1208         select HAVE_PERF_REGS
1209         select HAVE_PERF_USER_STACK_DUMP
1210 +       select HAVE_PREEMPT_LAZY
1211         select HAVE_REGS_AND_STACK_ACCESS_API
1212         select HAVE_RCU_TABLE_FREE
1213         select HAVE_SYSCALL_TRACEPOINTS
1214 @@ -689,7 +690,7 @@ config XEN_DOM0
1215  
1216  config XEN
1217         bool "Xen guest support on ARM64"
1218 -       depends on ARM64 && OF
1219 +       depends on ARM64 && OF && !PREEMPT_RT_FULL
1220         select SWIOTLB_XEN
1221         select PARAVIRT
1222         help
1223 diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
1224 index abd64bd1f6d9..9170788ffa37 100644
1225 --- a/arch/arm64/include/asm/thread_info.h
1226 +++ b/arch/arm64/include/asm/thread_info.h
1227 @@ -49,6 +49,7 @@ struct thread_info {
1228         mm_segment_t            addr_limit;     /* address limit */
1229         struct task_struct      *task;          /* main task structure */
1230         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
1231 +       int                     preempt_lazy_count; /* 0 => preemptable, <0 => bug */
1232         int                     cpu;            /* cpu */
1233  };
1234  
1235 @@ -109,6 +110,7 @@ static inline struct thread_info *current_thread_info(void)
1236  #define TIF_NEED_RESCHED       1
1237  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
1238  #define TIF_FOREIGN_FPSTATE    3       /* CPU's FP state is not current's */
1239 +#define TIF_NEED_RESCHED_LAZY  4
1240  #define TIF_NOHZ               7
1241  #define TIF_SYSCALL_TRACE      8
1242  #define TIF_SYSCALL_AUDIT      9
1243 @@ -124,6 +126,7 @@ static inline struct thread_info *current_thread_info(void)
1244  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
1245  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
1246  #define _TIF_FOREIGN_FPSTATE   (1 << TIF_FOREIGN_FPSTATE)
1247 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
1248  #define _TIF_NOHZ              (1 << TIF_NOHZ)
1249  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
1250  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
1251 @@ -132,7 +135,8 @@ static inline struct thread_info *current_thread_info(void)
1252  #define _TIF_32BIT             (1 << TIF_32BIT)
1253  
1254  #define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
1255 -                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
1256 +                                _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
1257 +                                _TIF_NEED_RESCHED_LAZY)
1258  
1259  #define _TIF_SYSCALL_WORK      (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1260                                  _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
1261 diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
1262 index 05070b72fc28..acfeddb1283a 100644
1263 --- a/arch/arm64/kernel/asm-offsets.c
1264 +++ b/arch/arm64/kernel/asm-offsets.c
1265 @@ -37,6 +37,7 @@ int main(void)
1266    BLANK();
1267    DEFINE(TI_FLAGS,             offsetof(struct thread_info, flags));
1268    DEFINE(TI_PREEMPT,           offsetof(struct thread_info, preempt_count));
1269 +  DEFINE(TI_PREEMPT_LAZY,      offsetof(struct thread_info, preempt_lazy_count));
1270    DEFINE(TI_ADDR_LIMIT,                offsetof(struct thread_info, addr_limit));
1271    DEFINE(TI_TASK,              offsetof(struct thread_info, task));
1272    DEFINE(TI_CPU,               offsetof(struct thread_info, cpu));
1273 diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
1274 index 441420ca7d08..404792bdca99 100644
1275 --- a/arch/arm64/kernel/entry.S
1276 +++ b/arch/arm64/kernel/entry.S
1277 @@ -434,11 +434,16 @@ ENDPROC(el1_sync)
1278  
1279  #ifdef CONFIG_PREEMPT
1280         ldr     w24, [tsk, #TI_PREEMPT]         // get preempt count
1281 -       cbnz    w24, 1f                         // preempt count != 0
1282 +       cbnz    w24, 2f                         // preempt count != 0
1283         ldr     x0, [tsk, #TI_FLAGS]            // get flags
1284 -       tbz     x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1285 -       bl      el1_preempt
1286 +       tbnz    x0, #TIF_NEED_RESCHED, 1f       // needs rescheduling?
1287 +
1288 +       ldr     w24, [tsk, #TI_PREEMPT_LAZY]    // get preempt lazy count
1289 +       cbnz    w24, 2f                         // preempt lazy count != 0
1290 +       tbz     x0, #TIF_NEED_RESCHED_LAZY, 2f  // needs rescheduling?
1291  1:
1292 +       bl      el1_preempt
1293 +2:
1294  #endif
1295  #ifdef CONFIG_TRACE_IRQFLAGS
1296         bl      trace_hardirqs_on
1297 @@ -452,6 +457,7 @@ ENDPROC(el1_irq)
1298  1:     bl      preempt_schedule_irq            // irq en/disable is done inside
1299         ldr     x0, [tsk, #TI_FLAGS]            // get new tasks TI_FLAGS
1300         tbnz    x0, #TIF_NEED_RESCHED, 1b       // needs rescheduling?
1301 +       tbnz    x0, #TIF_NEED_RESCHED_LAZY, 1b  // needs rescheduling?
1302         ret     x24
1303  #endif
1304  
1305 @@ -708,6 +714,7 @@ ENDPROC(cpu_switch_to)
1306   */
1307  work_pending:
1308         tbnz    x1, #TIF_NEED_RESCHED, work_resched
1309 +       tbnz    x1, #TIF_NEED_RESCHED_LAZY, work_resched
1310         /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
1311         mov     x0, sp                          // 'regs'
1312         enable_irq                              // enable interrupts for do_notify_resume()
1313 diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
1314 index 212ff92920d2..71ad38d3d76b 100644
1315 --- a/arch/mips/Kconfig
1316 +++ b/arch/mips/Kconfig
1317 @@ -2480,7 +2480,7 @@ config MIPS_ASID_BITS_VARIABLE
1318  #
1319  config HIGHMEM
1320         bool "High Memory Support"
1321 -       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA
1322 +       depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL
1323  
1324  config CPU_SUPPORTS_HIGHMEM
1325         bool
1326 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
1327 index 792cb1768c8f..ddf5a0fdb25a 100644
1328 --- a/arch/powerpc/Kconfig
1329 +++ b/arch/powerpc/Kconfig
1330 @@ -57,10 +57,11 @@ config LOCKDEP_SUPPORT
1331  
1332  config RWSEM_GENERIC_SPINLOCK
1333         bool
1334 +       default y if PREEMPT_RT_FULL
1335  
1336  config RWSEM_XCHGADD_ALGORITHM
1337         bool
1338 -       default y
1339 +       default y if !PREEMPT_RT_FULL
1340  
1341  config GENERIC_LOCKBREAK
1342         bool
1343 @@ -140,6 +141,7 @@ config PPC
1344         select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
1345         select GENERIC_STRNCPY_FROM_USER
1346         select GENERIC_STRNLEN_USER
1347 +       select HAVE_PREEMPT_LAZY
1348         select HAVE_MOD_ARCH_SPECIFIC
1349         select MODULES_USE_ELF_RELA
1350         select CLONE_BACKWARDS
1351 @@ -326,7 +328,7 @@ menu "Kernel options"
1352  
1353  config HIGHMEM
1354         bool "High memory support"
1355 -       depends on PPC32
1356 +       depends on PPC32 && !PREEMPT_RT_FULL
1357  
1358  source kernel/Kconfig.hz
1359  source kernel/Kconfig.preempt
1360 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
1361 index 87e4b2d8dcd4..981e501a4359 100644
1362 --- a/arch/powerpc/include/asm/thread_info.h
1363 +++ b/arch/powerpc/include/asm/thread_info.h
1364 @@ -43,6 +43,8 @@ struct thread_info {
1365         int             cpu;                    /* cpu we're on */
1366         int             preempt_count;          /* 0 => preemptable,
1367                                                    <0 => BUG */
1368 +       int             preempt_lazy_count;     /* 0 => preemptable,
1369 +                                                  <0 => BUG */
1370         unsigned long   local_flags;            /* private flags for thread */
1371  #ifdef CONFIG_LIVEPATCH
1372         unsigned long *livepatch_sp;
1373 @@ -88,8 +90,7 @@ static inline struct thread_info *current_thread_info(void)
1374  #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
1375  #define TIF_SIGPENDING         1       /* signal pending */
1376  #define TIF_NEED_RESCHED       2       /* rescheduling necessary */
1377 -#define TIF_POLLING_NRFLAG     3       /* true if poll_idle() is polling
1378 -                                          TIF_NEED_RESCHED */
1379 +#define TIF_NEED_RESCHED_LAZY  3       /* lazy rescheduling necessary */
1380  #define TIF_32BIT              4       /* 32 bit binary */
1381  #define TIF_RESTORE_TM         5       /* need to restore TM FP/VEC/VSX */
1382  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
1383 @@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
1384  #if defined(CONFIG_PPC64)
1385  #define TIF_ELF2ABI            18      /* function descriptors must die! */
1386  #endif
1387 +#define TIF_POLLING_NRFLAG     19      /* true if poll_idle() is polling
1388 +                                          TIF_NEED_RESCHED */
1389  
1390  /* as above, but as bit values */
1391  #define _TIF_SYSCALL_TRACE     (1<<TIF_SYSCALL_TRACE)
1392 @@ -125,14 +128,16 @@ static inline struct thread_info *current_thread_info(void)
1393  #define _TIF_SYSCALL_TRACEPOINT        (1<<TIF_SYSCALL_TRACEPOINT)
1394  #define _TIF_EMULATE_STACK_STORE       (1<<TIF_EMULATE_STACK_STORE)
1395  #define _TIF_NOHZ              (1<<TIF_NOHZ)
1396 +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY)
1397  #define _TIF_SYSCALL_DOTRACE   (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
1398                                  _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
1399                                  _TIF_NOHZ)
1400  
1401  #define _TIF_USER_WORK_MASK    (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
1402                                  _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
1403 -                                _TIF_RESTORE_TM)
1404 +                                _TIF_RESTORE_TM | _TIF_NEED_RESCHED_LAZY)
1405  #define _TIF_PERSYSCALL_MASK   (_TIF_RESTOREALL|_TIF_NOERROR)
1406 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
1407  
1408  /* Bits in local_flags */
1409  /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */
1410 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
1411 index b89d14c0352c..81ae8f4c88f6 100644
1412 --- a/arch/powerpc/kernel/asm-offsets.c
1413 +++ b/arch/powerpc/kernel/asm-offsets.c
1414 @@ -156,6 +156,7 @@ int main(void)
1415         DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
1416         DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
1417         DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
1418 +       DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count));
1419         DEFINE(TI_TASK, offsetof(struct thread_info, task));
1420         DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
1421  
1422 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
1423 index 9899032230b4..f95b93f46c47 100644
1424 --- a/arch/powerpc/kernel/entry_32.S
1425 +++ b/arch/powerpc/kernel/entry_32.S
1426 @@ -835,7 +835,14 @@ user_exc_return:           /* r10 contains MSR_KERNEL here */
1427         cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1428         bne     restore
1429         andi.   r8,r8,_TIF_NEED_RESCHED
1430 +       bne+    1f
1431 +       lwz     r0,TI_PREEMPT_LAZY(r9)
1432 +       cmpwi   0,r0,0          /* if non-zero, just restore regs and return */
1433 +       bne     restore
1434 +       lwz     r0,TI_FLAGS(r9)
1435 +       andi.   r0,r0,_TIF_NEED_RESCHED_LAZY
1436         beq+    restore
1437 +1:
1438         lwz     r3,_MSR(r1)
1439         andi.   r0,r3,MSR_EE    /* interrupts off? */
1440         beq     restore         /* don't schedule if so */
1441 @@ -846,11 +853,11 @@ user_exc_return:          /* r10 contains MSR_KERNEL here */
1442          */
1443         bl      trace_hardirqs_off
1444  #endif
1445 -1:     bl      preempt_schedule_irq
1446 +2:     bl      preempt_schedule_irq
1447         CURRENT_THREAD_INFO(r9, r1)
1448         lwz     r3,TI_FLAGS(r9)
1449 -       andi.   r0,r3,_TIF_NEED_RESCHED
1450 -       bne-    1b
1451 +       andi.   r0,r3,_TIF_NEED_RESCHED_MASK
1452 +       bne-    2b
1453  #ifdef CONFIG_TRACE_IRQFLAGS
1454         /* And now, to properly rebalance the above, we tell lockdep they
1455          * are being turned back on, which will happen when we return
1456 @@ -1171,7 +1178,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
1457  #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
1458  
1459  do_work:                       /* r10 contains MSR_KERNEL here */
1460 -       andi.   r0,r9,_TIF_NEED_RESCHED
1461 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1462         beq     do_user_signal
1463  
1464  do_resched:                    /* r10 contains MSR_KERNEL here */
1465 @@ -1192,7 +1199,7 @@ do_resched:                       /* r10 contains MSR_KERNEL here */
1466         MTMSRD(r10)             /* disable interrupts */
1467         CURRENT_THREAD_INFO(r9, r1)
1468         lwz     r9,TI_FLAGS(r9)
1469 -       andi.   r0,r9,_TIF_NEED_RESCHED
1470 +       andi.   r0,r9,_TIF_NEED_RESCHED_MASK
1471         bne-    do_resched
1472         andi.   r0,r9,_TIF_USER_WORK_MASK
1473         beq     restore_user
1474 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
1475 index 5afd03e5e8b8..f5d4c2a033ef 100644
1476 --- a/arch/powerpc/kernel/entry_64.S
1477 +++ b/arch/powerpc/kernel/entry_64.S
1478 @@ -657,7 +657,7 @@ _GLOBAL(ret_from_except_lite)
1479         bl      restore_math
1480         b       restore
1481  #endif
1482 -1:     andi.   r0,r4,_TIF_NEED_RESCHED
1483 +1:     andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1484         beq     2f
1485         bl      restore_interrupts
1486         SCHEDULE_USER
1487 @@ -719,10 +719,18 @@ _GLOBAL(ret_from_except_lite)
1488  
1489  #ifdef CONFIG_PREEMPT
1490         /* Check if we need to preempt */
1491 -       andi.   r0,r4,_TIF_NEED_RESCHED
1492 -       beq+    restore
1493 -       /* Check that preempt_count() == 0 and interrupts are enabled */
1494         lwz     r8,TI_PREEMPT(r9)
1495 +       cmpwi   0,r8,0          /* if non-zero, just restore regs and return */
1496 +       bne     restore
1497 +       andi.   r0,r4,_TIF_NEED_RESCHED
1498 +       bne+    check_count
1499 +
1500 +       andi.   r0,r4,_TIF_NEED_RESCHED_LAZY
1501 +       beq+    restore
1502 +       lwz     r8,TI_PREEMPT_LAZY(r9)
1503 +
1504 +       /* Check that preempt_count() == 0 and interrupts are enabled */
1505 +check_count:
1506         cmpwi   cr1,r8,0
1507         ld      r0,SOFTE(r1)
1508         cmpdi   r0,0
1509 @@ -739,7 +747,7 @@ _GLOBAL(ret_from_except_lite)
1510         /* Re-test flags and eventually loop */
1511         CURRENT_THREAD_INFO(r9, r1)
1512         ld      r4,TI_FLAGS(r9)
1513 -       andi.   r0,r4,_TIF_NEED_RESCHED
1514 +       andi.   r0,r4,_TIF_NEED_RESCHED_MASK
1515         bne     1b
1516  
1517         /*
1518 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
1519 index 08887cf2b20e..f1770ea2d094 100644
1520 --- a/arch/powerpc/kernel/irq.c
1521 +++ b/arch/powerpc/kernel/irq.c
1522 @@ -633,6 +633,7 @@ void irq_ctx_init(void)
1523         }
1524  }
1525  
1526 +#ifndef CONFIG_PREEMPT_RT_FULL
1527  void do_softirq_own_stack(void)
1528  {
1529         struct thread_info *curtp, *irqtp;
1530 @@ -650,6 +651,7 @@ void do_softirq_own_stack(void)
1531         if (irqtp->flags)
1532                 set_bits(irqtp->flags, &curtp->flags);
1533  }
1534 +#endif
1535  
1536  irq_hw_number_t virq_to_hw(unsigned int virq)
1537  {
1538 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
1539 index d9c912b6e632..7b2e997a5083 100644
1540 --- a/arch/powerpc/kernel/misc_32.S
1541 +++ b/arch/powerpc/kernel/misc_32.S
1542 @@ -40,6 +40,7 @@
1543   * We store the saved ksp_limit in the unused part
1544   * of the STACK_FRAME_OVERHEAD
1545   */
1546 +#ifndef CONFIG_PREEMPT_RT_FULL
1547  _GLOBAL(call_do_softirq)
1548         mflr    r0
1549         stw     r0,4(r1)
1550 @@ -56,6 +57,7 @@ _GLOBAL(call_do_softirq)
1551         stw     r10,THREAD+KSP_LIMIT(r2)
1552         mtlr    r0
1553         blr
1554 +#endif
1555  
1556  /*
1557   * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp);
1558 diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
1559 index cb195157b318..c919a2bfd0ca 100644
1560 --- a/arch/powerpc/kernel/misc_64.S
1561 +++ b/arch/powerpc/kernel/misc_64.S
1562 @@ -30,6 +30,7 @@
1563  
1564         .text
1565  
1566 +#ifndef CONFIG_PREEMPT_RT_FULL
1567  _GLOBAL(call_do_softirq)
1568         mflr    r0
1569         std     r0,16(r1)
1570 @@ -40,6 +41,7 @@ _GLOBAL(call_do_softirq)
1571         ld      r0,16(r1)
1572         mtlr    r0
1573         blr
1574 +#endif
1575  
1576  _GLOBAL(call_do_irq)
1577         mflr    r0
1578 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
1579 index c2024ac9d4e8..2303788da7e1 100644
1580 --- a/arch/powerpc/kvm/Kconfig
1581 +++ b/arch/powerpc/kvm/Kconfig
1582 @@ -172,6 +172,7 @@ config KVM_E500MC
1583  config KVM_MPIC
1584         bool "KVM in-kernel MPIC emulation"
1585         depends on KVM && E500
1586 +       depends on !PREEMPT_RT_FULL
1587         select HAVE_KVM_IRQCHIP
1588         select HAVE_KVM_IRQFD
1589         select HAVE_KVM_IRQ_ROUTING
1590 diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c
1591 index 57caaf11a83f..030c9bfe52e3 100644
1592 --- a/arch/powerpc/platforms/ps3/device-init.c
1593 +++ b/arch/powerpc/platforms/ps3/device-init.c
1594 @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev,
1595         }
1596         pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op);
1597  
1598 -       res = wait_event_interruptible(dev->done.wait,
1599 +       res = swait_event_interruptible(dev->done.wait,
1600                                        dev->done.done || kthread_should_stop());
1601         if (kthread_should_stop())
1602                 res = -EINTR;
1603 diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
1604 index 6c0378c0b8b5..abd58b4dff97 100644
1605 --- a/arch/sh/kernel/irq.c
1606 +++ b/arch/sh/kernel/irq.c
1607 @@ -147,6 +147,7 @@ void irq_ctx_exit(int cpu)
1608         hardirq_ctx[cpu] = NULL;
1609  }
1610  
1611 +#ifndef CONFIG_PREEMPT_RT_FULL
1612  void do_softirq_own_stack(void)
1613  {
1614         struct thread_info *curctx;
1615 @@ -174,6 +175,7 @@ void do_softirq_own_stack(void)
1616                   "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr"
1617         );
1618  }
1619 +#endif
1620  #else
1621  static inline void handle_one_irq(unsigned int irq)
1622  {
1623 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
1624 index 59b09600dd32..1b073eb3dc2a 100644
1625 --- a/arch/sparc/Kconfig
1626 +++ b/arch/sparc/Kconfig
1627 @@ -187,12 +187,10 @@ config NR_CPUS
1628  source kernel/Kconfig.hz
1629  
1630  config RWSEM_GENERIC_SPINLOCK
1631 -       bool
1632 -       default y if SPARC32
1633 +       def_bool PREEMPT_RT_FULL
1634  
1635  config RWSEM_XCHGADD_ALGORITHM
1636 -       bool
1637 -       default y if SPARC64
1638 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1639  
1640  config GENERIC_HWEIGHT
1641         bool
1642 diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
1643 index 34a7930b76ef..773740521008 100644
1644 --- a/arch/sparc/kernel/irq_64.c
1645 +++ b/arch/sparc/kernel/irq_64.c
1646 @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs)
1647         set_irq_regs(old_regs);
1648  }
1649  
1650 +#ifndef CONFIG_PREEMPT_RT_FULL
1651  void do_softirq_own_stack(void)
1652  {
1653         void *orig_sp, *sp = softirq_stack[smp_processor_id()];
1654 @@ -868,6 +869,7 @@ void do_softirq_own_stack(void)
1655         __asm__ __volatile__("mov %0, %%sp"
1656                              : : "r" (orig_sp));
1657  }
1658 +#endif
1659  
1660  #ifdef CONFIG_HOTPLUG_CPU
1661  void fixup_irqs(void)
1662 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
1663 index 2a1f0ce7c59a..bd4ab87efb31 100644
1664 --- a/arch/x86/Kconfig
1665 +++ b/arch/x86/Kconfig
1666 @@ -17,6 +17,7 @@ config X86_64
1667  ### Arch settings
1668  config X86
1669         def_bool y
1670 +       select HAVE_PREEMPT_LAZY
1671         select ACPI_LEGACY_TABLES_LOOKUP        if ACPI
1672         select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
1673         select ANON_INODES
1674 @@ -231,8 +232,11 @@ config ARCH_MAY_HAVE_PC_FDC
1675         def_bool y
1676         depends on ISA_DMA_API
1677  
1678 +config RWSEM_GENERIC_SPINLOCK
1679 +       def_bool PREEMPT_RT_FULL
1680 +
1681  config RWSEM_XCHGADD_ALGORITHM
1682 -       def_bool y
1683 +       def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL
1684  
1685  config GENERIC_CALIBRATE_DELAY
1686         def_bool y
1687 @@ -885,7 +889,7 @@ config IOMMU_HELPER
1688  config MAXSMP
1689         bool "Enable Maximum number of SMP Processors and NUMA Nodes"
1690         depends on X86_64 && SMP && DEBUG_KERNEL
1691 -       select CPUMASK_OFFSTACK
1692 +       select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL
1693         ---help---
1694           Enable maximum number of CPUS and NUMA Nodes for this architecture.
1695           If unsure, say N.
1696 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
1697 index 0ab5ee1c26af..fff8f6f1f90c 100644
1698 --- a/arch/x86/crypto/aesni-intel_glue.c
1699 +++ b/arch/x86/crypto/aesni-intel_glue.c
1700 @@ -372,14 +372,14 @@ static int ecb_encrypt(struct blkcipher_desc *desc,
1701         err = blkcipher_walk_virt(desc, &walk);
1702         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1703  
1704 -       kernel_fpu_begin();
1705         while ((nbytes = walk.nbytes)) {
1706 +               kernel_fpu_begin();
1707                 aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1708 -                             nbytes & AES_BLOCK_MASK);
1709 +                               nbytes & AES_BLOCK_MASK);
1710 +               kernel_fpu_end();
1711                 nbytes &= AES_BLOCK_SIZE - 1;
1712                 err = blkcipher_walk_done(desc, &walk, nbytes);
1713         }
1714 -       kernel_fpu_end();
1715  
1716         return err;
1717  }
1718 @@ -396,14 +396,14 @@ static int ecb_decrypt(struct blkcipher_desc *desc,
1719         err = blkcipher_walk_virt(desc, &walk);
1720         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1721  
1722 -       kernel_fpu_begin();
1723         while ((nbytes = walk.nbytes)) {
1724 +               kernel_fpu_begin();
1725                 aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1726                               nbytes & AES_BLOCK_MASK);
1727 +               kernel_fpu_end();
1728                 nbytes &= AES_BLOCK_SIZE - 1;
1729                 err = blkcipher_walk_done(desc, &walk, nbytes);
1730         }
1731 -       kernel_fpu_end();
1732  
1733         return err;
1734  }
1735 @@ -420,14 +420,14 @@ static int cbc_encrypt(struct blkcipher_desc *desc,
1736         err = blkcipher_walk_virt(desc, &walk);
1737         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1738  
1739 -       kernel_fpu_begin();
1740         while ((nbytes = walk.nbytes)) {
1741 +               kernel_fpu_begin();
1742                 aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1743                               nbytes & AES_BLOCK_MASK, walk.iv);
1744 +               kernel_fpu_end();
1745                 nbytes &= AES_BLOCK_SIZE - 1;
1746                 err = blkcipher_walk_done(desc, &walk, nbytes);
1747         }
1748 -       kernel_fpu_end();
1749  
1750         return err;
1751  }
1752 @@ -444,14 +444,14 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
1753         err = blkcipher_walk_virt(desc, &walk);
1754         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1755  
1756 -       kernel_fpu_begin();
1757         while ((nbytes = walk.nbytes)) {
1758 +               kernel_fpu_begin();
1759                 aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1760                               nbytes & AES_BLOCK_MASK, walk.iv);
1761 +               kernel_fpu_end();
1762                 nbytes &= AES_BLOCK_SIZE - 1;
1763                 err = blkcipher_walk_done(desc, &walk, nbytes);
1764         }
1765 -       kernel_fpu_end();
1766  
1767         return err;
1768  }
1769 @@ -503,18 +503,20 @@ static int ctr_crypt(struct blkcipher_desc *desc,
1770         err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
1771         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1772  
1773 -       kernel_fpu_begin();
1774         while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
1775 +               kernel_fpu_begin();
1776                 aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
1777                                       nbytes & AES_BLOCK_MASK, walk.iv);
1778 +               kernel_fpu_end();
1779                 nbytes &= AES_BLOCK_SIZE - 1;
1780                 err = blkcipher_walk_done(desc, &walk, nbytes);
1781         }
1782         if (walk.nbytes) {
1783 +               kernel_fpu_begin();
1784                 ctr_crypt_final(ctx, &walk);
1785 +               kernel_fpu_end();
1786                 err = blkcipher_walk_done(desc, &walk, 0);
1787         }
1788 -       kernel_fpu_end();
1789  
1790         return err;
1791  }
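
The aesni hunks above all make the same structural change: kernel_fpu_begin()/kernel_fpu_end() move from around the whole walk loop to inside it, so the FPU-owning, non-preemptible section covers one block batch instead of the entire request. Below is a minimal stand-alone C model of that reshaping; the model_* helpers and the chunk size are illustrative stand-ins, not kernel API.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    static int fpu_section_depth;        /* models "FPU held / preemption off" */
    static size_t longest_section;       /* bytes covered by one begin/end pair */

    static void model_fpu_begin(void) { fpu_section_depth++; }
    static void model_fpu_end(void)   { assert(fpu_section_depth > 0); fpu_section_depth--; }

    /* RT shape of the loops above: begin/end per block batch, not per request. */
    static void crypt_request(size_t len, size_t chunk)
    {
            for (size_t off = 0; off < len; off += chunk) {
                    size_t n = (len - off < chunk) ? len - off : chunk;

                    model_fpu_begin();          /* kernel_fpu_begin() in the patch */
                    if (n > longest_section)    /* "work" done with the FPU held */
                            longest_section = n;
                    model_fpu_end();            /* kernel_fpu_end(): preemptible again */
            }
    }

    int main(void)
    {
            crypt_request(4096, 512);
            printf("longest non-preemptible section covered %zu bytes\n", longest_section);
            return 0;
    }

The point of the change shows up directly in the model: however large the request, the longest stretch spent inside one begin/end pair is bounded by a single batch.
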
1792 diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
1793 index 8648158f3916..d7699130ee36 100644
1794 --- a/arch/x86/crypto/cast5_avx_glue.c
1795 +++ b/arch/x86/crypto/cast5_avx_glue.c
1796 @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled)
1797  static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1798                      bool enc)
1799  {
1800 -       bool fpu_enabled = false;
1801 +       bool fpu_enabled;
1802         struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
1803         const unsigned int bsize = CAST5_BLOCK_SIZE;
1804         unsigned int nbytes;
1805 @@ -75,7 +75,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1806                 u8 *wsrc = walk->src.virt.addr;
1807                 u8 *wdst = walk->dst.virt.addr;
1808  
1809 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1810 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1811  
1812                 /* Process multi-block batch */
1813                 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
1814 @@ -103,10 +103,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
1815                 } while (nbytes >= bsize);
1816  
1817  done:
1818 +               cast5_fpu_end(fpu_enabled);
1819                 err = blkcipher_walk_done(desc, walk, nbytes);
1820         }
1821 -
1822 -       cast5_fpu_end(fpu_enabled);
1823         return err;
1824  }
1825  
1826 @@ -227,7 +226,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
1827  static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1828                        struct scatterlist *src, unsigned int nbytes)
1829  {
1830 -       bool fpu_enabled = false;
1831 +       bool fpu_enabled;
1832         struct blkcipher_walk walk;
1833         int err;
1834  
1835 @@ -236,12 +235,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1836         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1837  
1838         while ((nbytes = walk.nbytes)) {
1839 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1840 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1841                 nbytes = __cbc_decrypt(desc, &walk);
1842 +               cast5_fpu_end(fpu_enabled);
1843                 err = blkcipher_walk_done(desc, &walk, nbytes);
1844         }
1845 -
1846 -       cast5_fpu_end(fpu_enabled);
1847         return err;
1848  }
1849  
1850 @@ -311,7 +309,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
1851  static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1852                      struct scatterlist *src, unsigned int nbytes)
1853  {
1854 -       bool fpu_enabled = false;
1855 +       bool fpu_enabled;
1856         struct blkcipher_walk walk;
1857         int err;
1858  
1859 @@ -320,13 +318,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1860         desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
1861  
1862         while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
1863 -               fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes);
1864 +               fpu_enabled = cast5_fpu_begin(false, nbytes);
1865                 nbytes = __ctr_crypt(desc, &walk);
1866 +               cast5_fpu_end(fpu_enabled);
1867                 err = blkcipher_walk_done(desc, &walk, nbytes);
1868         }
1869  
1870 -       cast5_fpu_end(fpu_enabled);
1871 -
1872         if (walk.nbytes) {
1873                 ctr_crypt_final(desc, &walk);
1874                 err = blkcipher_walk_done(desc, &walk, 0);
1875 diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
1876 index 6a85598931b5..3a506ce7ed93 100644
1877 --- a/arch/x86/crypto/glue_helper.c
1878 +++ b/arch/x86/crypto/glue_helper.c
1879 @@ -39,7 +39,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1880         void *ctx = crypto_blkcipher_ctx(desc->tfm);
1881         const unsigned int bsize = 128 / 8;
1882         unsigned int nbytes, i, func_bytes;
1883 -       bool fpu_enabled = false;
1884 +       bool fpu_enabled;
1885         int err;
1886  
1887         err = blkcipher_walk_virt(desc, walk);
1888 @@ -49,7 +49,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1889                 u8 *wdst = walk->dst.virt.addr;
1890  
1891                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1892 -                                            desc, fpu_enabled, nbytes);
1893 +                                            desc, false, nbytes);
1894  
1895                 for (i = 0; i < gctx->num_funcs; i++) {
1896                         func_bytes = bsize * gctx->funcs[i].num_blocks;
1897 @@ -71,10 +71,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
1898                 }
1899  
1900  done:
1901 +               glue_fpu_end(fpu_enabled);
1902                 err = blkcipher_walk_done(desc, walk, nbytes);
1903         }
1904  
1905 -       glue_fpu_end(fpu_enabled);
1906         return err;
1907  }
1908  
1909 @@ -194,7 +194,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1910                             struct scatterlist *src, unsigned int nbytes)
1911  {
1912         const unsigned int bsize = 128 / 8;
1913 -       bool fpu_enabled = false;
1914 +       bool fpu_enabled;
1915         struct blkcipher_walk walk;
1916         int err;
1917  
1918 @@ -203,12 +203,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx,
1919  
1920         while ((nbytes = walk.nbytes)) {
1921                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1922 -                                            desc, fpu_enabled, nbytes);
1923 +                                            desc, false, nbytes);
1924                 nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk);
1925 +               glue_fpu_end(fpu_enabled);
1926                 err = blkcipher_walk_done(desc, &walk, nbytes);
1927         }
1928  
1929 -       glue_fpu_end(fpu_enabled);
1930         return err;
1931  }
1932  EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit);
1933 @@ -277,7 +277,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1934                           struct scatterlist *src, unsigned int nbytes)
1935  {
1936         const unsigned int bsize = 128 / 8;
1937 -       bool fpu_enabled = false;
1938 +       bool fpu_enabled;
1939         struct blkcipher_walk walk;
1940         int err;
1941  
1942 @@ -286,13 +286,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
1943  
1944         while ((nbytes = walk.nbytes) >= bsize) {
1945                 fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1946 -                                            desc, fpu_enabled, nbytes);
1947 +                                            desc, false, nbytes);
1948                 nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk);
1949 +               glue_fpu_end(fpu_enabled);
1950                 err = blkcipher_walk_done(desc, &walk, nbytes);
1951         }
1952  
1953 -       glue_fpu_end(fpu_enabled);
1954 -
1955         if (walk.nbytes) {
1956                 glue_ctr_crypt_final_128bit(
1957                         gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk);
1958 @@ -347,7 +346,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1959                           void *tweak_ctx, void *crypt_ctx)
1960  {
1961         const unsigned int bsize = 128 / 8;
1962 -       bool fpu_enabled = false;
1963 +       bool fpu_enabled;
1964         struct blkcipher_walk walk;
1965         int err;
1966  
1967 @@ -360,21 +359,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
1968  
1969         /* set minimum length to bsize, for tweak_fn */
1970         fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1971 -                                    desc, fpu_enabled,
1972 +                                    desc, false,
1973                                      nbytes < bsize ? bsize : nbytes);
1974 -
1975         /* calculate first value of T */
1976         tweak_fn(tweak_ctx, walk.iv, walk.iv);
1977 +       glue_fpu_end(fpu_enabled);
1978  
1979         while (nbytes) {
1980 +               fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
1981 +                               desc, false, nbytes);
1982                 nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
1983  
1984 +               glue_fpu_end(fpu_enabled);
1985                 err = blkcipher_walk_done(desc, &walk, nbytes);
1986                 nbytes = walk.nbytes;
1987         }
1988 -
1989 -       glue_fpu_end(fpu_enabled);
1990 -
1991         return err;
1992  }
1993  EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
1994 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
1995 index 1433f6b4607d..f963fde8e4fa 100644
1996 --- a/arch/x86/entry/common.c
1997 +++ b/arch/x86/entry/common.c
1998 @@ -136,7 +136,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
1999  
2000  #define EXIT_TO_USERMODE_LOOP_FLAGS                            \
2001         (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
2002 -        _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
2003 +        _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY)
2004  
2005  static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2006  {
2007 @@ -152,9 +152,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
2008                 /* We have work to do. */
2009                 local_irq_enable();
2010  
2011 -               if (cached_flags & _TIF_NEED_RESCHED)
2012 +               if (cached_flags & _TIF_NEED_RESCHED_MASK)
2013                         schedule();
2014  
2015 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
2016 +               if (unlikely(current->forced_info.si_signo)) {
2017 +                       struct task_struct *t = current;
2018 +                       force_sig_info(t->forced_info.si_signo, &t->forced_info, t);
2019 +                       t->forced_info.si_signo = 0;
2020 +               }
2021 +#endif
2022                 if (cached_flags & _TIF_UPROBE)
2023                         uprobe_notify_resume(regs);
2024  
2025 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
2026 index 0b56666e6039..1d8ee026c9c5 100644
2027 --- a/arch/x86/entry/entry_32.S
2028 +++ b/arch/x86/entry/entry_32.S
2029 @@ -271,8 +271,25 @@ END(ret_from_exception)
2030  ENTRY(resume_kernel)
2031         DISABLE_INTERRUPTS(CLBR_ANY)
2032  need_resched:
2033 +       # preempt count == 0 + NEED_RESCHED set?
2034         cmpl    $0, PER_CPU_VAR(__preempt_count)
2035 +#ifndef CONFIG_PREEMPT_LAZY
2036         jnz     restore_all
2037 +#else
2038 +       jz test_int_off
2039 +
2040 +       # at least preempt count == 0 ?
2041 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2042 +       jne restore_all
2043 +
2044 +       GET_THREAD_INFO(%ebp)
2045 +       cmpl $0,TI_preempt_lazy_count(%ebp)     # non-zero preempt_lazy_count ?
2046 +       jnz restore_all
2047 +
2048 +       testl $_TIF_NEED_RESCHED_LAZY, TI_flags(%ebp)
2049 +       jz restore_all
2050 +test_int_off:
2051 +#endif
2052         testl   $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
2053         jz      restore_all
2054         call    preempt_schedule_irq
2055 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
2056 index 02fff3ebfb87..81ec3d016df0 100644
2057 --- a/arch/x86/entry/entry_64.S
2058 +++ b/arch/x86/entry/entry_64.S
2059 @@ -512,7 +512,23 @@ GLOBAL(retint_user)
2060         bt      $9, EFLAGS(%rsp)                /* were interrupts off? */
2061         jnc     1f
2062  0:     cmpl    $0, PER_CPU_VAR(__preempt_count)
2063 +#ifndef CONFIG_PREEMPT_LAZY
2064         jnz     1f
2065 +#else
2066 +       jz      do_preempt_schedule_irq
2067 +
2068 +       # at least preempt count == 0 ?
2069 +       cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count)
2070 +       jnz     1f
2071 +
2072 +       GET_THREAD_INFO(%rcx)
2073 +       cmpl    $0, TI_preempt_lazy_count(%rcx)
2074 +       jnz     1f
2075 +
2076 +       bt      $TIF_NEED_RESCHED_LAZY,TI_flags(%rcx)
2077 +       jnc     1f
2078 +do_preempt_schedule_irq:
2079 +#endif
2080         call    preempt_schedule_irq
2081         jmp     0b
2082  1:
2083 @@ -817,6 +833,7 @@ END(native_load_gs_index)
2084         jmp     2b
2085         .previous
2086  
2087 +#ifndef CONFIG_PREEMPT_RT_FULL
2088  /* Call softirq on interrupt stack. Interrupts are off. */
2089  ENTRY(do_softirq_own_stack)
2090         pushq   %rbp
2091 @@ -829,6 +846,7 @@ ENTRY(do_softirq_own_stack)
2092         decl    PER_CPU_VAR(irq_count)
2093         ret
2094  END(do_softirq_own_stack)
2095 +#endif
2096  
2097  #ifdef CONFIG_XEN
2098  idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
2099 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
2100 index 17f218645701..11bd1b7ee6eb 100644
2101 --- a/arch/x86/include/asm/preempt.h
2102 +++ b/arch/x86/include/asm/preempt.h
2103 @@ -79,17 +79,46 @@ static __always_inline void __preempt_count_sub(int val)
2104   * a decrement which hits zero means we have no preempt_count and should
2105   * reschedule.
2106   */
2107 -static __always_inline bool __preempt_count_dec_and_test(void)
2108 +static __always_inline bool ____preempt_count_dec_and_test(void)
2109  {
2110         GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e);
2111  }
2112  
2113 +static __always_inline bool __preempt_count_dec_and_test(void)
2114 +{
2115 +       if (____preempt_count_dec_and_test())
2116 +               return true;
2117 +#ifdef CONFIG_PREEMPT_LAZY
2118 +       if (current_thread_info()->preempt_lazy_count)
2119 +               return false;
2120 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2121 +#else
2122 +       return false;
2123 +#endif
2124 +}
2125 +
2126  /*
2127   * Returns true when we need to resched and can (barring IRQ state).
2128   */
2129  static __always_inline bool should_resched(int preempt_offset)
2130  {
2131 +#ifdef CONFIG_PREEMPT_LAZY
2132 +       u32 tmp;
2133 +
2134 +       tmp = raw_cpu_read_4(__preempt_count);
2135 +       if (tmp == preempt_offset)
2136 +               return true;
2137 +
2138 +       /* preempt count == 0 ? */
2139 +       tmp &= ~PREEMPT_NEED_RESCHED;
2140 +       if (tmp)
2141 +               return false;
2142 +       if (current_thread_info()->preempt_lazy_count)
2143 +               return false;
2144 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
2145 +#else
2146         return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
2147 +#endif
2148  }
2149  
2150  #ifdef CONFIG_PREEMPT
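
The preempt.h changes above are the C statement of the lazy-preemption rule that the entry-code hunks encode in assembly: an immediate reschedule only needs the preempt count to reach zero, while a lazy one additionally requires preempt_lazy_count to be zero and TIF_NEED_RESCHED_LAZY to be set. A small stand-alone model of that decision follows, with plain fields in place of the per-CPU count, thread_info and TIF bits (the names here are illustrative, and the inverted need-resched bit is folded into a plain flag).

    #include <stdbool.h>
    #include <stdio.h>

    struct ctx {
            int  preempt_count;        /* models __preempt_count, need-resched bit stripped */
            int  preempt_lazy_count;   /* models thread_info->preempt_lazy_count */
            bool need_resched;         /* models TIF_NEED_RESCHED */
            bool need_resched_lazy;    /* models TIF_NEED_RESCHED_LAZY */
    };

    /* Mirrors the CONFIG_PREEMPT_LAZY branch of should_resched() above. */
    static bool should_resched_now(const struct ctx *c)
    {
            if (c->preempt_count)
                    return false;              /* any hard disable blocks both kinds */
            if (c->need_resched)
                    return true;               /* immediate request: count == 0 is enough */
            if (c->preempt_lazy_count)
                    return false;              /* lazy request also respects the lazy count */
            return c->need_resched_lazy;
    }

    int main(void)
    {
            struct ctx a = { 0, 0, true,  false };  /* hard request, fully enabled */
            struct ctx b = { 0, 1, false, true  };  /* lazy request, lazy-disabled */
            struct ctx c = { 0, 0, false, true  };  /* lazy request, fully enabled */

            printf("%d %d %d\n", should_resched_now(&a),
                   should_resched_now(&b), should_resched_now(&c));
            return 0;
    }

Running it prints "1 0 1": a hard request wins as soon as the count is zero, a lazy request is held back by a non-zero lazy count, and goes through otherwise.
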
2151 diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
2152 index dd1e7d6387ab..d59bedb28bab 100644
2153 --- a/arch/x86/include/asm/signal.h
2154 +++ b/arch/x86/include/asm/signal.h
2155 @@ -23,6 +23,19 @@ typedef struct {
2156         unsigned long sig[_NSIG_WORDS];
2157  } sigset_t;
2158  
2159 +/*
2160 + * Because some traps use the IST stack, we must keep preemption
2161 + * disabled while calling do_trap(), but do_trap() may call
2162 + * force_sig_info() which will grab the signal spin_locks for the
2163 + * task, which in PREEMPT_RT_FULL are mutexes.  With
2164 + * ARCH_RT_DELAYS_SIGNAL_SEND defined, force_sig_info() will set
2165 + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit from
2166 + * the trap.
2167 + */
2168 +#if defined(CONFIG_PREEMPT_RT_FULL)
2169 +#define ARCH_RT_DELAYS_SIGNAL_SEND
2170 +#endif
2171 +
2172  #ifndef CONFIG_COMPAT
2173  typedef sigset_t compat_sigset_t;
2174  #endif
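
The signal.h comment above, together with the earlier exit_to_usermode_loop() hunk that checks current->forced_info.si_signo, describes a two-step delivery: the trap handler only records the signal while it cannot take sleeping locks, and the exit-to-user path sends it once it is safe. A tiny stand-alone model of that hand-off is sketched below; the struct and function names are illustrative, not the kernel's.

    #include <stdio.h>

    struct pending_sig {
            int si_signo;             /* 0 means "nothing deferred" */
    };

    static struct pending_sig forced_info;   /* models current->forced_info */

    /* Runs in a context where sleeping locks must not be taken (the trap). */
    static void trap_handler(int signo)
    {
            forced_info.si_signo = signo;    /* just record it */
    }

    /* Runs on the way back to user space, where taking locks is fine. */
    static void exit_to_usermode(void)
    {
            if (forced_info.si_signo) {
                    printf("delivering deferred signal %d\n", forced_info.si_signo);
                    forced_info.si_signo = 0;   /* mirrors t->forced_info.si_signo = 0 */
            }
    }

    int main(void)
    {
            trap_handler(11);       /* e.g. a fault that would raise a signal */
            exit_to_usermode();     /* the signal is actually sent here */
            return 0;
    }
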
2175 diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
2176 index 58505f01962f..02fa39652cd6 100644
2177 --- a/arch/x86/include/asm/stackprotector.h
2178 +++ b/arch/x86/include/asm/stackprotector.h
2179 @@ -59,7 +59,7 @@
2180   */
2181  static __always_inline void boot_init_stack_canary(void)
2182  {
2183 -       u64 canary;
2184 +       u64 uninitialized_var(canary);
2185         u64 tsc;
2186  
2187  #ifdef CONFIG_X86_64
2188 @@ -70,8 +70,15 @@ static __always_inline void boot_init_stack_canary(void)
2189          * of randomness. The TSC only matters for very early init,
2190          * there it already has some randomness on most systems. Later
2191          * on during the bootup the random pool has true entropy too.
2192 +        *
2193 +        * For preempt-rt we need to weaken the randomness a bit, as
2194 +        * we can't call into the random generator from atomic context
2195 +        * due to locking constraints. We just leave the canary
2196 +        * uninitialized and use the TSC-based randomness on top of it.
2197          */
2198 +#ifndef CONFIG_PREEMPT_RT_FULL
2199         get_random_bytes(&canary, sizeof(canary));
2200 +#endif
2201         tsc = rdtsc();
2202         canary += tsc + (tsc << 32UL);
2203  
2204 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2205 index 8b7c8d8e0852..631059ef61da 100644
2206 --- a/arch/x86/include/asm/thread_info.h
2207 +++ b/arch/x86/include/asm/thread_info.h
2208 @@ -57,6 +57,8 @@ struct thread_info {
2209         __u32                   flags;          /* low level flags */
2210         __u32                   status;         /* thread synchronous flags */
2211         __u32                   cpu;            /* current CPU */
2212 +       int                     preempt_lazy_count;     /* 0 => lazy preemptable
2213 +                                                          <0 => BUG */
2214  };
2215  
2216  #define INIT_THREAD_INFO(tsk)                  \
2217 @@ -73,6 +75,10 @@ struct thread_info {
2218  
2219  #include <asm/asm-offsets.h>
2220  
2221 +#define GET_THREAD_INFO(reg) \
2222 +       _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
2223 +       _ASM_SUB $(THREAD_SIZE),reg ;
2224 +
2225  #endif
2226  
2227  /*
2228 @@ -91,6 +97,7 @@ struct thread_info {
2229  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
2230  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
2231  #define TIF_SECCOMP            8       /* secure computing */
2232 +#define TIF_NEED_RESCHED_LAZY  9       /* lazy rescheduling necessary */
2233  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
2234  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
2235  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
2236 @@ -115,6 +122,7 @@ struct thread_info {
2237  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
2238  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
2239  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
2240 +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
2241  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
2242  #define _TIF_UPROBE            (1 << TIF_UPROBE)
2243  #define _TIF_NOTSC             (1 << TIF_NOTSC)
2244 @@ -151,6 +159,8 @@ struct thread_info {
2245  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2246  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2247  
2248 +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
2249 +
2250  #define STACK_WARN             (THREAD_SIZE/8)
2251  
2252  /*
2253 diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
2254 index cc44d926c17e..df278aa0f638 100644
2255 --- a/arch/x86/include/asm/uv/uv_bau.h
2256 +++ b/arch/x86/include/asm/uv/uv_bau.h
2257 @@ -615,9 +615,9 @@ struct bau_control {
2258         cycles_t                send_message;
2259         cycles_t                period_end;
2260         cycles_t                period_time;
2261 -       spinlock_t              uvhub_lock;
2262 -       spinlock_t              queue_lock;
2263 -       spinlock_t              disable_lock;
2264 +       raw_spinlock_t          uvhub_lock;
2265 +       raw_spinlock_t          queue_lock;
2266 +       raw_spinlock_t          disable_lock;
2267         /* tunables */
2268         int                     max_concurr;
2269         int                     max_concurr_const;
2270 @@ -776,15 +776,15 @@ static inline int atom_asr(short i, struct atomic_short *v)
2271   * to be lowered below the current 'v'.  atomic_add_unless can only stop
2272   * on equal.
2273   */
2274 -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
2275 +static inline int atomic_inc_unless_ge(raw_spinlock_t *lock, atomic_t *v, int u)
2276  {
2277 -       spin_lock(lock);
2278 +       raw_spin_lock(lock);
2279         if (atomic_read(v) >= u) {
2280 -               spin_unlock(lock);
2281 +               raw_spin_unlock(lock);
2282                 return 0;
2283         }
2284         atomic_inc(v);
2285 -       spin_unlock(lock);
2286 +       raw_spin_unlock(lock);
2287         return 1;
2288  }
2289  
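
The atomic_inc_unless_ge() hunk above only changes the lock type (spinlock_t to raw_spinlock_t, which stays a true spinning lock under PREEMPT_RT_FULL); the algorithm is untouched: take the lock, refuse the increment if the counter has already reached the ceiling, otherwise bump it. Here is a stand-alone model of that helper, with a pthread mutex standing in for the raw spinlock and plain ints for the atomics.

    #include <pthread.h>
    #include <stdio.h>

    /* Increment *v unless it already reached u; returns 1 on success, 0 otherwise.
     * The mutex plays the role of the raw spinlock in the patched helper. */
    static int inc_unless_ge(pthread_mutex_t *lock, int *v, int u)
    {
            int done = 0;

            pthread_mutex_lock(lock);
            if (*v < u) {
                    (*v)++;
                    done = 1;
            }
            pthread_mutex_unlock(lock);
            return done;
    }

    int main(void)
    {
            pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
            int active = 0, max_concurr = 2;

            for (int i = 0; i < 4; i++)
                    printf("attempt %d: %s (active=%d)\n", i,
                           inc_unless_ge(&lock, &active, max_concurr) ? "admitted" : "refused",
                           active);
            return 0;
    }

The first two attempts are admitted, the rest are refused once the counter reaches the ceiling, which is exactly how the BAU code caps concurrency.
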
2290 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
2291 index fbd19444403f..e78f477a4ae3 100644
2292 --- a/arch/x86/kernel/acpi/boot.c
2293 +++ b/arch/x86/kernel/acpi/boot.c
2294 @@ -87,7 +87,9 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
2295   *             ->ioapic_mutex
2296   *                     ->ioapic_lock
2297   */
2298 +#ifdef CONFIG_X86_IO_APIC
2299  static DEFINE_MUTEX(acpi_ioapic_lock);
2300 +#endif
2301  
2302  /* --------------------------------------------------------------------------
2303                                Boot-time Configuration
2304 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
2305 index 48e6d84f173e..0b5a8b994f65 100644
2306 --- a/arch/x86/kernel/apic/io_apic.c
2307 +++ b/arch/x86/kernel/apic/io_apic.c
2308 @@ -1712,7 +1712,8 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
2309  static inline bool ioapic_irqd_mask(struct irq_data *data)
2310  {
2311         /* If we are moving the irq we need to mask it */
2312 -       if (unlikely(irqd_is_setaffinity_pending(data))) {
2313 +       if (unlikely(irqd_is_setaffinity_pending(data) &&
2314 +                    !irqd_irq_inprogress(data))) {
2315                 mask_ioapic_irq(data);
2316                 return true;
2317         }
2318 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
2319 index 2bd5c6ff7ee7..a2c317f5839b 100644
2320 --- a/arch/x86/kernel/asm-offsets.c
2321 +++ b/arch/x86/kernel/asm-offsets.c
2322 @@ -31,6 +31,7 @@ void common(void) {
2323         BLANK();
2324         OFFSET(TI_flags, thread_info, flags);
2325         OFFSET(TI_status, thread_info, status);
2326 +       OFFSET(TI_preempt_lazy_count, thread_info, preempt_lazy_count);
2327  
2328         BLANK();
2329         OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
2330 @@ -88,4 +89,5 @@ void common(void) {
2331  
2332         BLANK();
2333         DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
2334 +       DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED);
2335  }
2336 diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
2337 index 79d8ec849468..accbf0e806d0 100644
2338 --- a/arch/x86/kernel/cpu/mcheck/mce.c
2339 +++ b/arch/x86/kernel/cpu/mcheck/mce.c
2340 @@ -41,6 +41,8 @@
2341  #include <linux/debugfs.h>
2342  #include <linux/irq_work.h>
2343  #include <linux/export.h>
2344 +#include <linux/jiffies.h>
2345 +#include <linux/swork.h>
2346  
2347  #include <asm/processor.h>
2348  #include <asm/traps.h>
2349 @@ -1291,7 +1293,7 @@ void mce_log_therm_throt_event(__u64 status)
2350  static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
2351  
2352  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
2353 -static DEFINE_PER_CPU(struct timer_list, mce_timer);
2354 +static DEFINE_PER_CPU(struct hrtimer, mce_timer);
2355  
2356  static unsigned long mce_adjust_timer_default(unsigned long interval)
2357  {
2358 @@ -1300,32 +1302,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
2359  
2360  static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
2361  
2362 -static void __restart_timer(struct timer_list *t, unsigned long interval)
2363 +static enum hrtimer_restart __restart_timer(struct hrtimer *timer, unsigned long interval)
2364  {
2365 -       unsigned long when = jiffies + interval;
2366 -       unsigned long flags;
2367 -
2368 -       local_irq_save(flags);
2369 -
2370 -       if (timer_pending(t)) {
2371 -               if (time_before(when, t->expires))
2372 -                       mod_timer(t, when);
2373 -       } else {
2374 -               t->expires = round_jiffies(when);
2375 -               add_timer_on(t, smp_processor_id());
2376 -       }
2377 -
2378 -       local_irq_restore(flags);
2379 +       if (!interval)
2380 +               return HRTIMER_NORESTART;
2381 +       hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(interval)));
2382 +       return HRTIMER_RESTART;
2383  }
2384  
2385 -static void mce_timer_fn(unsigned long data)
2386 +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer)
2387  {
2388 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2389 -       int cpu = smp_processor_id();
2390         unsigned long iv;
2391  
2392 -       WARN_ON(cpu != data);
2393 -
2394         iv = __this_cpu_read(mce_next_interval);
2395  
2396         if (mce_available(this_cpu_ptr(&cpu_info))) {
2397 @@ -1348,7 +1336,7 @@ static void mce_timer_fn(unsigned long data)
2398  
2399  done:
2400         __this_cpu_write(mce_next_interval, iv);
2401 -       __restart_timer(t, iv);
2402 +       return __restart_timer(timer, iv);
2403  }
2404  
2405  /*
2406 @@ -1356,7 +1344,7 @@ static void mce_timer_fn(unsigned long data)
2407   */
2408  void mce_timer_kick(unsigned long interval)
2409  {
2410 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2411 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2412         unsigned long iv = __this_cpu_read(mce_next_interval);
2413  
2414         __restart_timer(t, interval);
2415 @@ -1371,7 +1359,7 @@ static void mce_timer_delete_all(void)
2416         int cpu;
2417  
2418         for_each_online_cpu(cpu)
2419 -               del_timer_sync(&per_cpu(mce_timer, cpu));
2420 +               hrtimer_cancel(&per_cpu(mce_timer, cpu));
2421  }
2422  
2423  static void mce_do_trigger(struct work_struct *work)
2424 @@ -1381,6 +1369,56 @@ static void mce_do_trigger(struct work_struct *work)
2425  
2426  static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2427  
2428 +static void __mce_notify_work(struct swork_event *event)
2429 +{
2430 +       /* Not more than two messages every minute */
2431 +       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2432 +
2433 +       /* wake processes polling /dev/mcelog */
2434 +       wake_up_interruptible(&mce_chrdev_wait);
2435 +
2436 +       /*
2437 +        * There is no risk of missing notifications because
2438 +        * work_pending is always cleared before the function is
2439 +        * executed.
2440 +        */
2441 +       if (mce_helper[0] && !work_pending(&mce_trigger_work))
2442 +               schedule_work(&mce_trigger_work);
2443 +
2444 +       if (__ratelimit(&ratelimit))
2445 +               pr_info(HW_ERR "Machine check events logged\n");
2446 +}
2447 +
2448 +#ifdef CONFIG_PREEMPT_RT_FULL
2449 +static bool notify_work_ready __read_mostly;
2450 +static struct swork_event notify_work;
2451 +
2452 +static int mce_notify_work_init(void)
2453 +{
2454 +       int err;
2455 +
2456 +       err = swork_get();
2457 +       if (err)
2458 +               return err;
2459 +
2460 +       INIT_SWORK(&notify_work, __mce_notify_work);
2461 +       notify_work_ready = true;
2462 +       return 0;
2463 +}
2464 +
2465 +static void mce_notify_work(void)
2466 +{
2467 +       if (notify_work_ready)
2468 +               swork_queue(&notify_work);
2469 +}
2470 +#else
2471 +static void mce_notify_work(void)
2472 +{
2473 +       __mce_notify_work(NULL);
2474 +}
2475 +static inline int mce_notify_work_init(void) { return 0; }
2476 +#endif
2477 +
2478  /*
2479   * Notify the user(s) about new machine check events.
2480   * Can be called from interrupt context, but not from machine check/NMI
2481 @@ -1388,19 +1426,8 @@ static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
2482   */
2483  int mce_notify_irq(void)
2484  {
2485 -       /* Not more than two messages every minute */
2486 -       static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
2487 -
2488         if (test_and_clear_bit(0, &mce_need_notify)) {
2489 -               /* wake processes polling /dev/mcelog */
2490 -               wake_up_interruptible(&mce_chrdev_wait);
2491 -
2492 -               if (mce_helper[0])
2493 -                       schedule_work(&mce_trigger_work);
2494 -
2495 -               if (__ratelimit(&ratelimit))
2496 -                       pr_info(HW_ERR "Machine check events logged\n");
2497 -
2498 +               mce_notify_work();
2499                 return 1;
2500         }
2501         return 0;
2502 @@ -1717,7 +1744,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
2503         }
2504  }
2505  
2506 -static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2507 +static void mce_start_timer(unsigned int cpu, struct hrtimer *t)
2508  {
2509         unsigned long iv = check_interval * HZ;
2510  
2511 @@ -1726,16 +1753,17 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
2512  
2513         per_cpu(mce_next_interval, cpu) = iv;
2514  
2515 -       t->expires = round_jiffies(jiffies + iv);
2516 -       add_timer_on(t, cpu);
2517 +       hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL),
2518 +                       0, HRTIMER_MODE_REL_PINNED);
2519  }
2520  
2521  static void __mcheck_cpu_init_timer(void)
2522  {
2523 -       struct timer_list *t = this_cpu_ptr(&mce_timer);
2524 +       struct hrtimer *t = this_cpu_ptr(&mce_timer);
2525         unsigned int cpu = smp_processor_id();
2526  
2527 -       setup_pinned_timer(t, mce_timer_fn, cpu);
2528 +       hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2529 +       t->function = mce_timer_fn;
2530         mce_start_timer(cpu, t);
2531  }
2532  
2533 @@ -2459,6 +2487,8 @@ static void mce_disable_cpu(void *h)
2534         if (!mce_available(raw_cpu_ptr(&cpu_info)))
2535                 return;
2536  
2537 +       hrtimer_cancel(this_cpu_ptr(&mce_timer));
2538 +
2539         if (!(action & CPU_TASKS_FROZEN))
2540                 cmci_clear();
2541  
2542 @@ -2481,6 +2511,7 @@ static void mce_reenable_cpu(void *h)
2543                 if (b->init)
2544                         wrmsrl(msr_ops.ctl(i), b->ctl);
2545         }
2546 +       __mcheck_cpu_init_timer();
2547  }
2548  
2549  /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2550 @@ -2488,7 +2519,6 @@ static int
2551  mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2552  {
2553         unsigned int cpu = (unsigned long)hcpu;
2554 -       struct timer_list *t = &per_cpu(mce_timer, cpu);
2555  
2556         switch (action & ~CPU_TASKS_FROZEN) {
2557         case CPU_ONLINE:
2558 @@ -2508,11 +2538,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2559                 break;
2560         case CPU_DOWN_PREPARE:
2561                 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2562 -               del_timer_sync(t);
2563                 break;
2564         case CPU_DOWN_FAILED:
2565                 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2566 -               mce_start_timer(cpu, t);
2567                 break;
2568         }
2569  
2570 @@ -2551,6 +2579,10 @@ static __init int mcheck_init_device(void)
2571                 goto err_out;
2572         }
2573  
2574 +       err = mce_notify_work_init();
2575 +       if (err)
2576 +               goto err_out;
2577 +
2578         if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2579                 err = -ENOMEM;
2580                 goto err_out;
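
The mce.c hunks above replace the per-CPU timer_list with an hrtimer, so the poll interval kept in jiffies has to be converted to nanoseconds before being handed to hrtimer_forward_now() and hrtimer_start_range_ns(). A quick stand-alone illustration of that conversion follows; HZ=250 and the five-minute interval are example values only (the latter mirrors the usual MCE check_interval default).

    #include <stdio.h>

    #define HZ            250ULL               /* example tick rate, not a kernel value */
    #define NSEC_PER_SEC  1000000000ULL

    /* Roughly what jiffies_to_nsecs() / jiffies_to_usecs()*1000 compute in the hunks. */
    static unsigned long long jiffies_to_ns(unsigned long long j)
    {
            return j * (NSEC_PER_SEC / HZ);
    }

    int main(void)
    {
            unsigned long long check_interval = 5 * 60;     /* seconds */
            unsigned long long iv = check_interval * HZ;    /* jiffies, as in mce.c */

            printf("%llu jiffies -> %llu ns (%.0f s)\n",
                   iv, jiffies_to_ns(iv), jiffies_to_ns(iv) / 1e9);
            return 0;
    }
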
2581 diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
2582 index 09675712eba8..eea7557b355d 100644
2583 --- a/arch/x86/kernel/dumpstack_32.c
2584 +++ b/arch/x86/kernel/dumpstack_32.c
2585 @@ -42,7 +42,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
2586                 unsigned long *stack, unsigned long bp,
2587                 const struct stacktrace_ops *ops, void *data)
2588  {
2589 -       const unsigned cpu = get_cpu();
2590 +       const unsigned cpu = get_cpu_light();
2591         int graph = 0;
2592         u32 *prev_esp;
2593  
2594 @@ -84,7 +84,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
2595                         break;
2596                 touch_nmi_watchdog();
2597         }
2598 -       put_cpu();
2599 +       put_cpu_light();
2600  }
2601  EXPORT_SYMBOL(dump_trace);
2602  
2603 diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
2604 index 9ee4520ce83c..2cd610b68868 100644
2605 --- a/arch/x86/kernel/dumpstack_64.c
2606 +++ b/arch/x86/kernel/dumpstack_64.c
2607 @@ -152,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
2608                 unsigned long *stack, unsigned long bp,
2609                 const struct stacktrace_ops *ops, void *data)
2610  {
2611 -       const unsigned cpu = get_cpu();
2612 +       const unsigned cpu = get_cpu_light();
2613         unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
2614         unsigned long dummy;
2615         unsigned used = 0;
2616 @@ -239,7 +239,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
2617          * This handles the process stack:
2618          */
2619         bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph);
2620 -       put_cpu();
2621 +       put_cpu_light();
2622  }
2623  EXPORT_SYMBOL(dump_trace);
2624  
2625 @@ -253,7 +253,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
2626         int cpu;
2627         int i;
2628  
2629 -       preempt_disable();
2630 +       migrate_disable();
2631         cpu = smp_processor_id();
2632  
2633         irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
2634 @@ -299,7 +299,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
2635                 stack++;
2636                 touch_nmi_watchdog();
2637         }
2638 -       preempt_enable();
2639 +       migrate_enable();
2640  
2641         pr_cont("\n");
2642         show_trace_log_lvl(task, regs, sp, bp, log_lvl);
2643 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2644 index 1f38d9a4d9de..053bf3b2ef39 100644
2645 --- a/arch/x86/kernel/irq_32.c
2646 +++ b/arch/x86/kernel/irq_32.c
2647 @@ -127,6 +127,7 @@ void irq_ctx_init(int cpu)
2648                cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));
2649  }
2650  
2651 +#ifndef CONFIG_PREEMPT_RT_FULL
2652  void do_softirq_own_stack(void)
2653  {
2654         struct irq_stack *irqstk;
2655 @@ -143,6 +144,7 @@ void do_softirq_own_stack(void)
2656  
2657         call_on_stack(__do_softirq, isp);
2658  }
2659 +#endif
2660  
2661  bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
2662  {
2663 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
2664 index d86be29c38c7..b0e29d1a0571 100644
2665 --- a/arch/x86/kernel/process_32.c
2666 +++ b/arch/x86/kernel/process_32.c
2667 @@ -35,6 +35,7 @@
2668  #include <linux/uaccess.h>
2669  #include <linux/io.h>
2670  #include <linux/kdebug.h>
2671 +#include <linux/highmem.h>
2672  
2673  #include <asm/pgtable.h>
2674  #include <asm/ldt.h>
2675 @@ -210,6 +211,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
2676  }
2677  EXPORT_SYMBOL_GPL(start_thread);
2678  
2679 +#ifdef CONFIG_PREEMPT_RT_FULL
2680 +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p)
2681 +{
2682 +       int i;
2683 +
2684 +       /*
2685 +        * Clear @prev's kmap_atomic mappings
2686 +        */
2687 +       for (i = 0; i < prev_p->kmap_idx; i++) {
2688 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2689 +               pte_t *ptep = kmap_pte - idx;
2690 +
2691 +               kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx));
2692 +       }
2693 +       /*
2694 +        * Restore @next_p's kmap_atomic mappings
2695 +        */
2696 +       for (i = 0; i < next_p->kmap_idx; i++) {
2697 +               int idx = i + KM_TYPE_NR * smp_processor_id();
2698 +
2699 +               if (!pte_none(next_p->kmap_pte[i]))
2700 +                       set_pte(kmap_pte - idx, next_p->kmap_pte[i]);
2701 +       }
2702 +}
2703 +#else
2704 +static inline void
2705 +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { }
2706 +#endif
2707 +
2708  
2709  /*
2710   *     switch_to(x,y) should switch tasks from x to y.
2711 @@ -286,6 +316,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
2712                      task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
2713                 __switch_to_xtra(prev_p, next_p, tss);
2714  
2715 +       switch_kmaps(prev_p, next_p);
2716 +
2717         /*
2718          * Leave lazy mode, flushing any hypercalls made here.
2719          * This must be done before restoring TLS segments so
2720 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
2721 index b62c85229711..d907b281a9d6 100644
2722 --- a/arch/x86/kvm/lapic.c
2723 +++ b/arch/x86/kvm/lapic.c
2724 @@ -1938,6 +1938,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
2725         hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
2726                      HRTIMER_MODE_ABS_PINNED);
2727         apic->lapic_timer.timer.function = apic_timer_fn;
2728 +       apic->lapic_timer.timer.irqsafe = 1;
2729  
2730         /*
2731          * APIC is created enabled. This will prevent kvm_lapic_set_base from
2732 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2733 index 699f8726539a..24f30c86510c 100644
2734 --- a/arch/x86/kvm/x86.c
2735 +++ b/arch/x86/kvm/x86.c
2736 @@ -5865,6 +5865,13 @@ int kvm_arch_init(void *opaque)
2737                 goto out;
2738         }
2739  
2740 +#ifdef CONFIG_PREEMPT_RT_FULL
2741 +       if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2742 +               printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n");
2743 +               return -EOPNOTSUPP;
2744 +       }
2745 +#endif
2746 +
2747         r = kvm_mmu_module_init();
2748         if (r)
2749                 goto out_free_percpu;
2750 diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
2751 index 6d18b70ed5a9..f752724c22e8 100644
2752 --- a/arch/x86/mm/highmem_32.c
2753 +++ b/arch/x86/mm/highmem_32.c
2754 @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap);
2755   */
2756  void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2757  {
2758 +       pte_t pte = mk_pte(page, prot);
2759         unsigned long vaddr;
2760         int idx, type;
2761  
2762 -       preempt_disable();
2763 +       preempt_disable_nort();
2764         pagefault_disable();
2765  
2766         if (!PageHighMem(page))
2767 @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
2768         idx = type + KM_TYPE_NR*smp_processor_id();
2769         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2770         BUG_ON(!pte_none(*(kmap_pte-idx)));
2771 -       set_pte(kmap_pte-idx, mk_pte(page, prot));
2772 +#ifdef CONFIG_PREEMPT_RT_FULL
2773 +       current->kmap_pte[type] = pte;
2774 +#endif
2775 +       set_pte(kmap_pte-idx, pte);
2776         arch_flush_lazy_mmu_mode();
2777  
2778         return (void *)vaddr;
2779 @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr)
2780                  * is a bad idea also, in case the page changes cacheability
2781                  * attributes or becomes a protected page in a hypervisor.
2782                  */
2783 +#ifdef CONFIG_PREEMPT_RT_FULL
2784 +               current->kmap_pte[type] = __pte(0);
2785 +#endif
2786                 kpte_clear_flush(kmap_pte-idx, vaddr);
2787                 kmap_atomic_idx_pop();
2788                 arch_flush_lazy_mmu_mode();
2789 @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr)
2790  #endif
2791  
2792         pagefault_enable();
2793 -       preempt_enable();
2794 +       preempt_enable_nort();
2795  }
2796  EXPORT_SYMBOL(__kunmap_atomic);
2797  
2798 diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
2799 index ada98b39b8ad..585f6829653b 100644
2800 --- a/arch/x86/mm/iomap_32.c
2801 +++ b/arch/x86/mm/iomap_32.c
2802 @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free);
2803  
2804  void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2805  {
2806 +       pte_t pte = pfn_pte(pfn, prot);
2807         unsigned long vaddr;
2808         int idx, type;
2809  
2810 @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
2811         type = kmap_atomic_idx_push();
2812         idx = type + KM_TYPE_NR * smp_processor_id();
2813         vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
2814 -       set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
2815 +       WARN_ON(!pte_none(*(kmap_pte - idx)));
2816 +
2817 +#ifdef CONFIG_PREEMPT_RT_FULL
2818 +       current->kmap_pte[type] = pte;
2819 +#endif
2820 +       set_pte(kmap_pte - idx, pte);
2821         arch_flush_lazy_mmu_mode();
2822  
2823         return (void *)vaddr;
2824 @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr)
2825                  * is a bad idea also, in case the page changes cacheability
2826                  * attributes or becomes a protected page in a hypervisor.
2827                  */
2828 +#ifdef CONFIG_PREEMPT_RT_FULL
2829 +               current->kmap_pte[type] = __pte(0);
2830 +#endif
2831                 kpte_clear_flush(kmap_pte-idx, vaddr);
2832                 kmap_atomic_idx_pop();
2833         }
2834 diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
2835 index fdb4d42b4ce5..8ab90fbecff0 100644
2836 --- a/arch/x86/platform/uv/tlb_uv.c
2837 +++ b/arch/x86/platform/uv/tlb_uv.c
2838 @@ -729,9 +729,9 @@ static void destination_plugged(struct bau_desc *bau_desc,
2839  
2840                 quiesce_local_uvhub(hmaster);
2841  
2842 -               spin_lock(&hmaster->queue_lock);
2843 +               raw_spin_lock(&hmaster->queue_lock);
2844                 reset_with_ipi(&bau_desc->distribution, bcp);
2845 -               spin_unlock(&hmaster->queue_lock);
2846 +               raw_spin_unlock(&hmaster->queue_lock);
2847  
2848                 end_uvhub_quiesce(hmaster);
2849  
2850 @@ -751,9 +751,9 @@ static void destination_timeout(struct bau_desc *bau_desc,
2851  
2852                 quiesce_local_uvhub(hmaster);
2853  
2854 -               spin_lock(&hmaster->queue_lock);
2855 +               raw_spin_lock(&hmaster->queue_lock);
2856                 reset_with_ipi(&bau_desc->distribution, bcp);
2857 -               spin_unlock(&hmaster->queue_lock);
2858 +               raw_spin_unlock(&hmaster->queue_lock);
2859  
2860                 end_uvhub_quiesce(hmaster);
2861  
2862 @@ -774,7 +774,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2863         cycles_t tm1;
2864  
2865         hmaster = bcp->uvhub_master;
2866 -       spin_lock(&hmaster->disable_lock);
2867 +       raw_spin_lock(&hmaster->disable_lock);
2868         if (!bcp->baudisabled) {
2869                 stat->s_bau_disabled++;
2870                 tm1 = get_cycles();
2871 @@ -787,7 +787,7 @@ static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
2872                         }
2873                 }
2874         }
2875 -       spin_unlock(&hmaster->disable_lock);
2876 +       raw_spin_unlock(&hmaster->disable_lock);
2877  }
2878  
2879  static void count_max_concurr(int stat, struct bau_control *bcp,
2880 @@ -850,7 +850,7 @@ static void record_send_stats(cycles_t time1, cycles_t time2,
2881   */
2882  static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
2883  {
2884 -       spinlock_t *lock = &hmaster->uvhub_lock;
2885 +       raw_spinlock_t *lock = &hmaster->uvhub_lock;
2886         atomic_t *v;
2887  
2888         v = &hmaster->active_descriptor_count;
2889 @@ -983,7 +983,7 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2890         struct bau_control *hmaster;
2891  
2892         hmaster = bcp->uvhub_master;
2893 -       spin_lock(&hmaster->disable_lock);
2894 +       raw_spin_lock(&hmaster->disable_lock);
2895         if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
2896                 stat->s_bau_reenabled++;
2897                 for_each_present_cpu(tcpu) {
2898 @@ -995,10 +995,10 @@ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
2899                                 tbcp->period_giveups = 0;
2900                         }
2901                 }
2902 -               spin_unlock(&hmaster->disable_lock);
2903 +               raw_spin_unlock(&hmaster->disable_lock);
2904                 return 0;
2905         }
2906 -       spin_unlock(&hmaster->disable_lock);
2907 +       raw_spin_unlock(&hmaster->disable_lock);
2908         return -1;
2909  }
2910  
2911 @@ -1916,9 +1916,9 @@ static void __init init_per_cpu_tunables(void)
2912                 bcp->cong_reps                  = congested_reps;
2913                 bcp->disabled_period =          sec_2_cycles(disabled_period);
2914                 bcp->giveup_limit =             giveup_limit;
2915 -               spin_lock_init(&bcp->queue_lock);
2916 -               spin_lock_init(&bcp->uvhub_lock);
2917 -               spin_lock_init(&bcp->disable_lock);
2918 +               raw_spin_lock_init(&bcp->queue_lock);
2919 +               raw_spin_lock_init(&bcp->uvhub_lock);
2920 +               raw_spin_lock_init(&bcp->disable_lock);
2921         }
2922  }
2923  
2924 diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
2925 index b333fc45f9ec..8b85916e6986 100644
2926 --- a/arch/x86/platform/uv/uv_time.c
2927 +++ b/arch/x86/platform/uv/uv_time.c
2928 @@ -57,7 +57,7 @@ static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
2929  
2930  /* There is one of these allocated per node */
2931  struct uv_rtc_timer_head {
2932 -       spinlock_t      lock;
2933 +       raw_spinlock_t  lock;
2934         /* next cpu waiting for timer, local node relative: */
2935         int             next_cpu;
2936         /* number of cpus on this node: */
2937 @@ -177,7 +177,7 @@ static __init int uv_rtc_allocate_timers(void)
2938                                 uv_rtc_deallocate_timers();
2939                                 return -ENOMEM;
2940                         }
2941 -                       spin_lock_init(&head->lock);
2942 +                       raw_spin_lock_init(&head->lock);
2943                         head->ncpus = uv_blade_nr_possible_cpus(bid);
2944                         head->next_cpu = -1;
2945                         blade_info[bid] = head;
2946 @@ -231,7 +231,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2947         unsigned long flags;
2948         int next_cpu;
2949  
2950 -       spin_lock_irqsave(&head->lock, flags);
2951 +       raw_spin_lock_irqsave(&head->lock, flags);
2952  
2953         next_cpu = head->next_cpu;
2954         *t = expires;
2955 @@ -243,12 +243,12 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
2956                 if (uv_setup_intr(cpu, expires)) {
2957                         *t = ULLONG_MAX;
2958                         uv_rtc_find_next_timer(head, pnode);
2959 -                       spin_unlock_irqrestore(&head->lock, flags);
2960 +                       raw_spin_unlock_irqrestore(&head->lock, flags);
2961                         return -ETIME;
2962                 }
2963         }
2964  
2965 -       spin_unlock_irqrestore(&head->lock, flags);
2966 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2967         return 0;
2968  }
2969  
2970 @@ -267,7 +267,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2971         unsigned long flags;
2972         int rc = 0;
2973  
2974 -       spin_lock_irqsave(&head->lock, flags);
2975 +       raw_spin_lock_irqsave(&head->lock, flags);
2976  
2977         if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
2978                 rc = 1;
2979 @@ -279,7 +279,7 @@ static int uv_rtc_unset_timer(int cpu, int force)
2980                         uv_rtc_find_next_timer(head, pnode);
2981         }
2982  
2983 -       spin_unlock_irqrestore(&head->lock, flags);
2984 +       raw_spin_unlock_irqrestore(&head->lock, flags);
2985  
2986         return rc;
2987  }
2988 @@ -299,13 +299,18 @@ static int uv_rtc_unset_timer(int cpu, int force)
2989  static cycle_t uv_read_rtc(struct clocksource *cs)
2990  {
2991         unsigned long offset;
2992 +       cycle_t cycles;
2993  
2994 +       preempt_disable();
2995         if (uv_get_min_hub_revision_id() == 1)
2996                 offset = 0;
2997         else
2998                 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
2999  
3000 -       return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3001 +       cycles = (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
3002 +       preempt_enable();
3003 +
3004 +       return cycles;
3005  }
3006  
3007  /*
3008 diff --git a/block/blk-core.c b/block/blk-core.c
3009 index 36c7ac328d8c..caa5fc1be2a2 100644
3010 --- a/block/blk-core.c
3011 +++ b/block/blk-core.c
3012 @@ -125,6 +125,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
3013  
3014         INIT_LIST_HEAD(&rq->queuelist);
3015         INIT_LIST_HEAD(&rq->timeout_list);
3016 +#ifdef CONFIG_PREEMPT_RT_FULL
3017 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3018 +#endif
3019         rq->cpu = -1;
3020         rq->q = q;
3021         rq->__sector = (sector_t) -1;
3022 @@ -233,7 +236,7 @@ EXPORT_SYMBOL(blk_start_queue_async);
3023   **/
3024  void blk_start_queue(struct request_queue *q)
3025  {
3026 -       WARN_ON(!irqs_disabled());
3027 +       WARN_ON_NONRT(!irqs_disabled());
3028  
3029         queue_flag_clear(QUEUE_FLAG_STOPPED, q);
3030         __blk_run_queue(q);
3031 @@ -659,7 +662,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
3032                 if (nowait)
3033                         return -EBUSY;
3034  
3035 -               ret = wait_event_interruptible(q->mq_freeze_wq,
3036 +               ret = swait_event_interruptible(q->mq_freeze_wq,
3037                                 !atomic_read(&q->mq_freeze_depth) ||
3038                                 blk_queue_dying(q));
3039                 if (blk_queue_dying(q))
3040 @@ -679,7 +682,7 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref)
3041         struct request_queue *q =
3042                 container_of(ref, struct request_queue, q_usage_counter);
3043  
3044 -       wake_up_all(&q->mq_freeze_wq);
3045 +       swake_up_all(&q->mq_freeze_wq);
3046  }
3047  
3048  static void blk_rq_timed_out_timer(unsigned long data)
3049 @@ -748,7 +751,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
3050         q->bypass_depth = 1;
3051         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
3052  
3053 -       init_waitqueue_head(&q->mq_freeze_wq);
3054 +       init_swait_queue_head(&q->mq_freeze_wq);
3055  
3056         /*
3057          * Init percpu_ref in atomic mode so that it's faster to shutdown.
3058 @@ -3171,7 +3174,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
3059                 blk_run_queue_async(q);
3060         else
3061                 __blk_run_queue(q);
3062 -       spin_unlock(q->queue_lock);
3063 +       spin_unlock_irq(q->queue_lock);
3064  }
3065  
3066  static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
3067 @@ -3219,7 +3222,6 @@ EXPORT_SYMBOL(blk_check_plugged);
3068  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3069  {
3070         struct request_queue *q;
3071 -       unsigned long flags;
3072         struct request *rq;
3073         LIST_HEAD(list);
3074         unsigned int depth;
3075 @@ -3239,11 +3241,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3076         q = NULL;
3077         depth = 0;
3078  
3079 -       /*
3080 -        * Save and disable interrupts here, to avoid doing it for every
3081 -        * queue lock we have to take.
3082 -        */
3083 -       local_irq_save(flags);
3084         while (!list_empty(&list)) {
3085                 rq = list_entry_rq(list.next);
3086                 list_del_init(&rq->queuelist);
3087 @@ -3256,7 +3253,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3088                                 queue_unplugged(q, depth, from_schedule);
3089                         q = rq->q;
3090                         depth = 0;
3091 -                       spin_lock(q->queue_lock);
3092 +                       spin_lock_irq(q->queue_lock);
3093                 }
3094  
3095                 /*
3096 @@ -3283,8 +3280,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3097          */
3098         if (q)
3099                 queue_unplugged(q, depth, from_schedule);
3100 -
3101 -       local_irq_restore(flags);
3102  }
3103  
3104  void blk_finish_plug(struct blk_plug *plug)
3105 diff --git a/block/blk-ioc.c b/block/blk-ioc.c
3106 index 381cb50a673c..dc8785233d94 100644
3107 --- a/block/blk-ioc.c
3108 +++ b/block/blk-ioc.c
3109 @@ -7,6 +7,7 @@
3110  #include <linux/bio.h>
3111  #include <linux/blkdev.h>
3112  #include <linux/slab.h>
3113 +#include <linux/delay.h>
3114  
3115  #include "blk.h"
3116  
3117 @@ -109,7 +110,7 @@ static void ioc_release_fn(struct work_struct *work)
3118                         spin_unlock(q->queue_lock);
3119                 } else {
3120                         spin_unlock_irqrestore(&ioc->lock, flags);
3121 -                       cpu_relax();
3122 +                       cpu_chill();
3123                         spin_lock_irqsave_nested(&ioc->lock, flags, 1);
3124                 }
3125         }
3126 @@ -187,7 +188,7 @@ void put_io_context_active(struct io_context *ioc)
3127                         spin_unlock(icq->q->queue_lock);
3128                 } else {
3129                         spin_unlock_irqrestore(&ioc->lock, flags);
3130 -                       cpu_relax();
3131 +                       cpu_chill();
3132                         goto retry;
3133                 }
3134         }
3135 diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
3136 index bb3ed488f7b5..628c6c13c482 100644
3137 --- a/block/blk-mq-cpu.c
3138 +++ b/block/blk-mq-cpu.c
3139 @@ -16,7 +16,7 @@
3140  #include "blk-mq.h"
3141  
3142  static LIST_HEAD(blk_mq_cpu_notify_list);
3143 -static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
3144 +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
3145  
3146  static int blk_mq_main_cpu_notify(struct notifier_block *self,
3147                                   unsigned long action, void *hcpu)
3148 @@ -25,7 +25,10 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
3149         struct blk_mq_cpu_notifier *notify;
3150         int ret = NOTIFY_OK;
3151  
3152 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
3153 +       if (action != CPU_POST_DEAD)
3154 +               return NOTIFY_OK;
3155 +
3156 +       spin_lock(&blk_mq_cpu_notify_lock);
3157  
3158         list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
3159                 ret = notify->notify(notify->data, action, cpu);
3160 @@ -33,7 +36,7 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
3161                         break;
3162         }
3163  
3164 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
3165 +       spin_unlock(&blk_mq_cpu_notify_lock);
3166         return ret;
3167  }
3168  
3169 @@ -41,16 +44,16 @@ void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
3170  {
3171         BUG_ON(!notifier->notify);
3172  
3173 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
3174 +       spin_lock(&blk_mq_cpu_notify_lock);
3175         list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
3176 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
3177 +       spin_unlock(&blk_mq_cpu_notify_lock);
3178  }
3179  
3180  void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
3181  {
3182 -       raw_spin_lock(&blk_mq_cpu_notify_lock);
3183 +       spin_lock(&blk_mq_cpu_notify_lock);
3184         list_del(&notifier->list);
3185 -       raw_spin_unlock(&blk_mq_cpu_notify_lock);
3186 +       spin_unlock(&blk_mq_cpu_notify_lock);
3187  }
3188  
3189  void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
3190 diff --git a/block/blk-mq.c b/block/blk-mq.c
3191 index c207fa9870eb..ac71b0455e9f 100644
3192 --- a/block/blk-mq.c
3193 +++ b/block/blk-mq.c
3194 @@ -92,7 +92,7 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
3195  
3196  static void blk_mq_freeze_queue_wait(struct request_queue *q)
3197  {
3198 -       wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3199 +       swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
3200  }
3201  
3202  /*
3203 @@ -130,7 +130,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
3204         WARN_ON_ONCE(freeze_depth < 0);
3205         if (!freeze_depth) {
3206                 percpu_ref_reinit(&q->q_usage_counter);
3207 -               wake_up_all(&q->mq_freeze_wq);
3208 +               swake_up_all(&q->mq_freeze_wq);
3209         }
3210  }
3211  EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
3212 @@ -149,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q)
3213          * dying, we need to ensure that processes currently waiting on
3214          * the queue are notified as well.
3215          */
3216 -       wake_up_all(&q->mq_freeze_wq);
3217 +       swake_up_all(&q->mq_freeze_wq);
3218  }
3219  
3220  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
3221 @@ -197,6 +197,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
3222         rq->resid_len = 0;
3223         rq->sense = NULL;
3224  
3225 +#ifdef CONFIG_PREEMPT_RT_FULL
3226 +       INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work);
3227 +#endif
3228         INIT_LIST_HEAD(&rq->timeout_list);
3229         rq->timeout = 0;
3230  
3231 @@ -379,6 +382,17 @@ void blk_mq_end_request(struct request *rq, int error)
3232  }
3233  EXPORT_SYMBOL(blk_mq_end_request);
3234  
3235 +#ifdef CONFIG_PREEMPT_RT_FULL
3236 +
3237 +void __blk_mq_complete_request_remote_work(struct work_struct *work)
3238 +{
3239 +       struct request *rq = container_of(work, struct request, work);
3240 +
3241 +       rq->q->softirq_done_fn(rq);
3242 +}
3243 +
3244 +#else
3245 +
3246  static void __blk_mq_complete_request_remote(void *data)
3247  {
3248         struct request *rq = data;
3249 @@ -386,6 +400,8 @@ static void __blk_mq_complete_request_remote(void *data)
3250         rq->q->softirq_done_fn(rq);
3251  }
3252  
3253 +#endif
3254 +
3255  static void blk_mq_ipi_complete_request(struct request *rq)
3256  {
3257         struct blk_mq_ctx *ctx = rq->mq_ctx;
3258 @@ -397,19 +413,23 @@ static void blk_mq_ipi_complete_request(struct request *rq)
3259                 return;
3260         }
3261  
3262 -       cpu = get_cpu();
3263 +       cpu = get_cpu_light();
3264         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
3265                 shared = cpus_share_cache(cpu, ctx->cpu);
3266  
3267         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
3268 +#ifdef CONFIG_PREEMPT_RT_FULL
3269 +               schedule_work_on(ctx->cpu, &rq->work);
3270 +#else
3271                 rq->csd.func = __blk_mq_complete_request_remote;
3272                 rq->csd.info = rq;
3273                 rq->csd.flags = 0;
3274                 smp_call_function_single_async(ctx->cpu, &rq->csd);
3275 +#endif
3276         } else {
3277                 rq->q->softirq_done_fn(rq);
3278         }
3279 -       put_cpu();
3280 +       put_cpu_light();
3281  }
3282  
3283  static void __blk_mq_complete_request(struct request *rq)
3284 @@ -938,14 +958,14 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
3285                 return;
3286  
3287         if (!async) {
3288 -               int cpu = get_cpu();
3289 +               int cpu = get_cpu_light();
3290                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
3291                         __blk_mq_run_hw_queue(hctx);
3292 -                       put_cpu();
3293 +                       put_cpu_light();
3294                         return;
3295                 }
3296  
3297 -               put_cpu();
3298 +               put_cpu_light();
3299         }
3300  
3301         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
3302 @@ -1667,7 +1687,7 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
3303  {
3304         struct blk_mq_hw_ctx *hctx = data;
3305  
3306 -       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3307 +       if (action == CPU_POST_DEAD)
3308                 return blk_mq_hctx_cpu_offline(hctx, cpu);
3309  
3310         /*
3311 diff --git a/block/blk-mq.h b/block/blk-mq.h
3312 index 9087b11037b7..0401d76e827c 100644
3313 --- a/block/blk-mq.h
3314 +++ b/block/blk-mq.h
3315 @@ -86,12 +86,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
3316   */
3317  static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
3318  {
3319 -       return __blk_mq_get_ctx(q, get_cpu());
3320 +       return __blk_mq_get_ctx(q, get_cpu_light());
3321  }
3322  
3323  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
3324  {
3325 -       put_cpu();
3326 +       put_cpu_light();
3327  }
3328  
3329  struct blk_mq_alloc_data {
3330 diff --git a/block/blk-softirq.c b/block/blk-softirq.c
3331 index 53b1737e978d..81c3c0a62edf 100644
3332 --- a/block/blk-softirq.c
3333 +++ b/block/blk-softirq.c
3334 @@ -51,6 +51,7 @@ static void trigger_softirq(void *data)
3335                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3336  
3337         local_irq_restore(flags);
3338 +       preempt_check_resched_rt();
3339  }
3340  
3341  /*
3342 @@ -93,6 +94,7 @@ static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
3343                                  this_cpu_ptr(&blk_cpu_done));
3344                 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3345                 local_irq_enable();
3346 +               preempt_check_resched_rt();
3347         }
3348  
3349         return NOTIFY_OK;
3350 @@ -150,6 +152,7 @@ void __blk_complete_request(struct request *req)
3351                 goto do_local;
3352  
3353         local_irq_restore(flags);
3354 +       preempt_check_resched_rt();
3355  }
3356  
3357  /**
3358 diff --git a/block/bounce.c b/block/bounce.c
3359 index 1cb5dd3a5da1..2f1ec8a67cbe 100644
3360 --- a/block/bounce.c
3361 +++ b/block/bounce.c
3362 @@ -55,11 +55,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
3363         unsigned long flags;
3364         unsigned char *vto;
3365  
3366 -       local_irq_save(flags);
3367 +       local_irq_save_nort(flags);
3368         vto = kmap_atomic(to->bv_page);
3369         memcpy(vto + to->bv_offset, vfrom, to->bv_len);
3370         kunmap_atomic(vto);
3371 -       local_irq_restore(flags);
3372 +       local_irq_restore_nort(flags);
3373  }
3374  
3375  #else /* CONFIG_HIGHMEM */
3376 diff --git a/crypto/algapi.c b/crypto/algapi.c
3377 index df939b54b09f..efe5e06adcf7 100644
3378 --- a/crypto/algapi.c
3379 +++ b/crypto/algapi.c
3380 @@ -718,13 +718,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2);
3381  
3382  int crypto_register_notifier(struct notifier_block *nb)
3383  {
3384 -       return blocking_notifier_chain_register(&crypto_chain, nb);
3385 +       return srcu_notifier_chain_register(&crypto_chain, nb);
3386  }
3387  EXPORT_SYMBOL_GPL(crypto_register_notifier);
3388  
3389  int crypto_unregister_notifier(struct notifier_block *nb)
3390  {
3391 -       return blocking_notifier_chain_unregister(&crypto_chain, nb);
3392 +       return srcu_notifier_chain_unregister(&crypto_chain, nb);
3393  }
3394  EXPORT_SYMBOL_GPL(crypto_unregister_notifier);
3395  
3396 diff --git a/crypto/api.c b/crypto/api.c
3397 index bbc147cb5dec..bc1a848f02ec 100644
3398 --- a/crypto/api.c
3399 +++ b/crypto/api.c
3400 @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list);
3401  DECLARE_RWSEM(crypto_alg_sem);
3402  EXPORT_SYMBOL_GPL(crypto_alg_sem);
3403  
3404 -BLOCKING_NOTIFIER_HEAD(crypto_chain);
3405 +SRCU_NOTIFIER_HEAD(crypto_chain);
3406  EXPORT_SYMBOL_GPL(crypto_chain);
3407  
3408  static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg);
3409 @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v)
3410  {
3411         int ok;
3412  
3413 -       ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3414 +       ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3415         if (ok == NOTIFY_DONE) {
3416                 request_module("cryptomgr");
3417 -               ok = blocking_notifier_call_chain(&crypto_chain, val, v);
3418 +               ok = srcu_notifier_call_chain(&crypto_chain, val, v);
3419         }
3420  
3421         return ok;
3422 diff --git a/crypto/internal.h b/crypto/internal.h
3423 index 7eefcdb00227..0ecc7f5a2f40 100644
3424 --- a/crypto/internal.h
3425 +++ b/crypto/internal.h
3426 @@ -47,7 +47,7 @@ struct crypto_larval {
3427  
3428  extern struct list_head crypto_alg_list;
3429  extern struct rw_semaphore crypto_alg_sem;
3430 -extern struct blocking_notifier_head crypto_chain;
3431 +extern struct srcu_notifier_head crypto_chain;
3432  
3433  #ifdef CONFIG_PROC_FS
3434  void __init crypto_init_proc(void);
3435 @@ -146,7 +146,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg)
3436  
3437  static inline void crypto_notify(unsigned long val, void *v)
3438  {
3439 -       blocking_notifier_call_chain(&crypto_chain, val, v);
3440 +       srcu_notifier_call_chain(&crypto_chain, val, v);
3441  }
3442  
3443  #endif /* _CRYPTO_INTERNAL_H */
3444 diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
3445 index fded776236e2..bda523219d50 100644
3446 --- a/drivers/acpi/acpica/acglobal.h
3447 +++ b/drivers/acpi/acpica/acglobal.h
3448 @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending);
3449   * interrupt level
3450   */
3451  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */
3452 -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock);    /* For ACPI H/W except GPE registers */
3453 +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock);        /* For ACPI H/W except GPE registers */
3454  ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock);
3455  
3456  /* Mutex for _OSI support */
3457 diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
3458 index 3b7fb99362b6..696bf8e62afb 100644
3459 --- a/drivers/acpi/acpica/hwregs.c
3460 +++ b/drivers/acpi/acpica/hwregs.c
3461 @@ -363,14 +363,14 @@ acpi_status acpi_hw_clear_acpi_status(void)
3462                           ACPI_BITMASK_ALL_FIXED_STATUS,
3463                           ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address)));
3464  
3465 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3466 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3467  
3468         /* Clear the fixed events in PM1 A/B */
3469  
3470         status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS,
3471                                         ACPI_BITMASK_ALL_FIXED_STATUS);
3472  
3473 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3474 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3475  
3476         if (ACPI_FAILURE(status)) {
3477                 goto exit;
3478 diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
3479 index 98c26ff39409..6e236f2ea791 100644
3480 --- a/drivers/acpi/acpica/hwxface.c
3481 +++ b/drivers/acpi/acpica/hwxface.c
3482 @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3483                 return_ACPI_STATUS(AE_BAD_PARAMETER);
3484         }
3485  
3486 -       lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock);
3487 +       raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags);
3488  
3489         /*
3490          * At this point, we know that the parent register is one of the
3491 @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value)
3492  
3493  unlock_and_exit:
3494  
3495 -       acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags);
3496 +       raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags);
3497         return_ACPI_STATUS(status);
3498  }
3499  
3500 diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
3501 index 15073375bd00..357e7ca5a587 100644
3502 --- a/drivers/acpi/acpica/utmutex.c
3503 +++ b/drivers/acpi/acpica/utmutex.c
3504 @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void)
3505                 return_ACPI_STATUS (status);
3506         }
3507  
3508 -       status = acpi_os_create_lock (&acpi_gbl_hardware_lock);
3509 +       status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock);
3510         if (ACPI_FAILURE (status)) {
3511                 return_ACPI_STATUS (status);
3512         }
3513 @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void)
3514         /* Delete the spinlocks */
3515  
3516         acpi_os_delete_lock(acpi_gbl_gpe_lock);
3517 -       acpi_os_delete_lock(acpi_gbl_hardware_lock);
3518 +       acpi_os_delete_raw_lock(acpi_gbl_hardware_lock);
3519         acpi_os_delete_lock(acpi_gbl_reference_count_lock);
3520  
3521         /* Delete the reader/writer lock */
3522 diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
3523 index 051b6158d1b7..7ad293bef6ed 100644
3524 --- a/drivers/ata/libata-sff.c
3525 +++ b/drivers/ata/libata-sff.c
3526 @@ -678,9 +678,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_device *dev, unsigned char *buf,
3527         unsigned long flags;
3528         unsigned int consumed;
3529  
3530 -       local_irq_save(flags);
3531 +       local_irq_save_nort(flags);
3532         consumed = ata_sff_data_xfer32(dev, buf, buflen, rw);
3533 -       local_irq_restore(flags);
3534 +       local_irq_restore_nort(flags);
3535  
3536         return consumed;
3537  }
3538 @@ -719,7 +719,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3539                 unsigned long flags;
3540  
3541                 /* FIXME: use a bounce buffer */
3542 -               local_irq_save(flags);
3543 +               local_irq_save_nort(flags);
3544                 buf = kmap_atomic(page);
3545  
3546                 /* do the actual data transfer */
3547 @@ -727,7 +727,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
3548                                        do_write);
3549  
3550                 kunmap_atomic(buf);
3551 -               local_irq_restore(flags);
3552 +               local_irq_restore_nort(flags);
3553         } else {
3554                 buf = page_address(page);
3555                 ap->ops->sff_data_xfer(qc->dev, buf + offset, qc->sect_size,
3556 @@ -864,7 +864,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3557                 unsigned long flags;
3558  
3559                 /* FIXME: use bounce buffer */
3560 -               local_irq_save(flags);
3561 +               local_irq_save_nort(flags);
3562                 buf = kmap_atomic(page);
3563  
3564                 /* do the actual data transfer */
3565 @@ -872,7 +872,7 @@ static int __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
3566                                                                 count, rw);
3567  
3568                 kunmap_atomic(buf);
3569 -               local_irq_restore(flags);
3570 +               local_irq_restore_nort(flags);
3571         } else {
3572                 buf = page_address(page);
3573                 consumed = ap->ops->sff_data_xfer(dev,  buf + offset,
3574 diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
3575 index 4b5cd3a7b2b6..fa8329ad79fd 100644
3576 --- a/drivers/block/zram/zcomp.c
3577 +++ b/drivers/block/zram/zcomp.c
3578 @@ -118,12 +118,19 @@ ssize_t zcomp_available_show(const char *comp, char *buf)
3579  
3580  struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
3581  {
3582 -       return *get_cpu_ptr(comp->stream);
3583 +       struct zcomp_strm *zstrm;
3584 +
3585 +       zstrm = *this_cpu_ptr(comp->stream);
3586 +       spin_lock(&zstrm->zcomp_lock);
3587 +       return zstrm;
3588  }
3589  
3590  void zcomp_stream_put(struct zcomp *comp)
3591  {
3592 -       put_cpu_ptr(comp->stream);
3593 +       struct zcomp_strm *zstrm;
3594 +
3595 +       zstrm = *this_cpu_ptr(comp->stream);
3596 +       spin_unlock(&zstrm->zcomp_lock);
3597  }
3598  
3599  int zcomp_compress(struct zcomp_strm *zstrm,
3600 @@ -174,6 +181,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
3601                         pr_err("Can't allocate a compression stream\n");
3602                         return NOTIFY_BAD;
3603                 }
3604 +               spin_lock_init(&zstrm->zcomp_lock);
3605                 *per_cpu_ptr(comp->stream, cpu) = zstrm;
3606                 break;
3607         case CPU_DEAD:
3608 diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
3609 index 478cac2ed465..f7a6efdc3285 100644
3610 --- a/drivers/block/zram/zcomp.h
3611 +++ b/drivers/block/zram/zcomp.h
3612 @@ -14,6 +14,7 @@ struct zcomp_strm {
3613         /* compression/decompression buffer */
3614         void *buffer;
3615         struct crypto_comp *tfm;
3616 +       spinlock_t zcomp_lock;
3617  };
3618  
3619  /* dynamic per-device compression frontend */
3620 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
3621 index 04365b17ee67..b4a0577a4dbc 100644
3622 --- a/drivers/block/zram/zram_drv.c
3623 +++ b/drivers/block/zram/zram_drv.c
3624 @@ -519,6 +519,8 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
3625                 goto out_error;
3626         }
3627  
3628 +       zram_meta_init_table_locks(meta, disksize);
3629 +
3630         return meta;
3631  
3632  out_error:
3633 @@ -566,28 +568,28 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
3634         struct zram_meta *meta = zram->meta;
3635         unsigned long handle;
3636         unsigned int size;
3637 +       struct zcomp_strm *zstrm;
3638  
3639 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3640 +       zram_lock_table(&meta->table[index]);
3641         handle = meta->table[index].handle;
3642         size = zram_get_obj_size(meta, index);
3643  
3644         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
3645 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3646 +               zram_unlock_table(&meta->table[index]);
3647                 clear_page(mem);
3648                 return 0;
3649         }
3650  
3651 +       zstrm = zcomp_stream_get(zram->comp);
3652         cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
3653         if (size == PAGE_SIZE) {
3654                 copy_page(mem, cmem);
3655         } else {
3656 -               struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
3657 -
3658                 ret = zcomp_decompress(zstrm, cmem, size, mem);
3659 -               zcomp_stream_put(zram->comp);
3660         }
3661         zs_unmap_object(meta->mem_pool, handle);
3662 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3663 +       zcomp_stream_put(zram->comp);
3664 +       zram_unlock_table(&meta->table[index]);
3665  
3666         /* Should NEVER happen. Return bio error if it does. */
3667         if (unlikely(ret)) {
3668 @@ -607,14 +609,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
3669         struct zram_meta *meta = zram->meta;
3670         page = bvec->bv_page;
3671  
3672 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3673 +       zram_lock_table(&meta->table[index]);
3674         if (unlikely(!meta->table[index].handle) ||
3675                         zram_test_flag(meta, index, ZRAM_ZERO)) {
3676 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3677 +               zram_unlock_table(&meta->table[index]);
3678                 handle_zero_page(bvec);
3679                 return 0;
3680         }
3681 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3682 +       zram_unlock_table(&meta->table[index]);
3683  
3684         if (is_partial_io(bvec))
3685                 /* Use  a temporary buffer to decompress the page */
3686 @@ -691,10 +693,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3687                 if (user_mem)
3688                         kunmap_atomic(user_mem);
3689                 /* Free memory associated with this sector now. */
3690 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3691 +               zram_lock_table(&meta->table[index]);
3692                 zram_free_page(zram, index);
3693                 zram_set_flag(meta, index, ZRAM_ZERO);
3694 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3695 +               zram_unlock_table(&meta->table[index]);
3696  
3697                 atomic64_inc(&zram->stats.zero_pages);
3698                 ret = 0;
3699 @@ -785,12 +787,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
3700          * Free memory associated with this sector
3701          * before overwriting unused sectors.
3702          */
3703 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3704 +       zram_lock_table(&meta->table[index]);
3705         zram_free_page(zram, index);
3706  
3707         meta->table[index].handle = handle;
3708         zram_set_obj_size(meta, index, clen);
3709 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3710 +       zram_unlock_table(&meta->table[index]);
3711  
3712         /* Update stats */
3713         atomic64_add(clen, &zram->stats.compr_data_size);
3714 @@ -833,9 +835,9 @@ static void zram_bio_discard(struct zram *zram, u32 index,
3715         }
3716  
3717         while (n >= PAGE_SIZE) {
3718 -               bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3719 +               zram_lock_table(&meta->table[index]);
3720                 zram_free_page(zram, index);
3721 -               bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3722 +               zram_unlock_table(&meta->table[index]);
3723                 atomic64_inc(&zram->stats.notify_free);
3724                 index++;
3725                 n -= PAGE_SIZE;
3726 @@ -964,9 +966,9 @@ static void zram_slot_free_notify(struct block_device *bdev,
3727         zram = bdev->bd_disk->private_data;
3728         meta = zram->meta;
3729  
3730 -       bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
3731 +       zram_lock_table(&meta->table[index]);
3732         zram_free_page(zram, index);
3733 -       bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
3734 +       zram_unlock_table(&meta->table[index]);
3735         atomic64_inc(&zram->stats.notify_free);
3736  }
3737  
3738 diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
3739 index 74fcf10da374..fd4020c99b9e 100644
3740 --- a/drivers/block/zram/zram_drv.h
3741 +++ b/drivers/block/zram/zram_drv.h
3742 @@ -73,6 +73,9 @@ enum zram_pageflags {
3743  struct zram_table_entry {
3744         unsigned long handle;
3745         unsigned long value;
3746 +#ifdef CONFIG_PREEMPT_RT_BASE
3747 +       spinlock_t lock;
3748 +#endif
3749  };
3750  
3751  struct zram_stats {
3752 @@ -120,4 +123,42 @@ struct zram {
3753          */
3754         bool claim; /* Protected by bdev->bd_mutex */
3755  };
3756 +
3757 +#ifndef CONFIG_PREEMPT_RT_BASE
3758 +static inline void zram_lock_table(struct zram_table_entry *table)
3759 +{
3760 +       bit_spin_lock(ZRAM_ACCESS, &table->value);
3761 +}
3762 +
3763 +static inline void zram_unlock_table(struct zram_table_entry *table)
3764 +{
3765 +       bit_spin_unlock(ZRAM_ACCESS, &table->value);
3766 +}
3767 +
3768 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { }
3769 +#else /* CONFIG_PREEMPT_RT_BASE */
3770 +static inline void zram_lock_table(struct zram_table_entry *table)
3771 +{
3772 +       spin_lock(&table->lock);
3773 +       __set_bit(ZRAM_ACCESS, &table->value);
3774 +}
3775 +
3776 +static inline void zram_unlock_table(struct zram_table_entry *table)
3777 +{
3778 +       __clear_bit(ZRAM_ACCESS, &table->value);
3779 +       spin_unlock(&table->lock);
3780 +}
3781 +
3782 +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize)
3783 +{
3784 +        size_t num_pages = disksize >> PAGE_SHIFT;
3785 +        size_t index;
3786 +
3787 +        for (index = 0; index < num_pages; index++) {
3788 +               spinlock_t *lock = &meta->table[index].lock;
3789 +               spin_lock_init(lock);
3790 +        }
3791 +}
3792 +#endif /* CONFIG_PREEMPT_RT_BASE */
3793 +
3794  #endif
3795 diff --git a/drivers/char/random.c b/drivers/char/random.c
3796 index 3efb3bf0ab83..c894d2e266f3 100644
3797 --- a/drivers/char/random.c
3798 +++ b/drivers/char/random.c
3799 @@ -1028,8 +1028,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3800         } sample;
3801         long delta, delta2, delta3;
3802  
3803 -       preempt_disable();
3804 -
3805         sample.jiffies = jiffies;
3806         sample.cycles = random_get_entropy();
3807         sample.num = num;
3808 @@ -1070,7 +1068,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
3809                  */
3810                 credit_entropy_bits(r, min_t(int, fls(delta>>1), 11));
3811         }
3812 -       preempt_enable();
3813  }
3814  
3815  void add_input_randomness(unsigned int type, unsigned int code,
3816 @@ -1123,28 +1120,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs)
3817         return *(ptr + f->reg_idx++);
3818  }
3819  
3820 -void add_interrupt_randomness(int irq, int irq_flags)
3821 +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip)
3822  {
3823         struct entropy_store    *r;
3824         struct fast_pool        *fast_pool = this_cpu_ptr(&irq_randomness);
3825 -       struct pt_regs          *regs = get_irq_regs();
3826         unsigned long           now = jiffies;
3827         cycles_t                cycles = random_get_entropy();
3828         __u32                   c_high, j_high;
3829 -       __u64                   ip;
3830         unsigned long           seed;
3831         int                     credit = 0;
3832  
3833         if (cycles == 0)
3834 -               cycles = get_reg(fast_pool, regs);
3835 +               cycles = get_reg(fast_pool, NULL);
3836         c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0;
3837         j_high = (sizeof(now) > 4) ? now >> 32 : 0;
3838         fast_pool->pool[0] ^= cycles ^ j_high ^ irq;
3839         fast_pool->pool[1] ^= now ^ c_high;
3840 -       ip = regs ? instruction_pointer(regs) : _RET_IP_;
3841 +       if (!ip)
3842 +               ip = _RET_IP_;
3843         fast_pool->pool[2] ^= ip;
3844         fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 :
3845 -               get_reg(fast_pool, regs);
3846 +               get_reg(fast_pool, NULL);
3847  
3848         fast_mix(fast_pool);
3849         add_interrupt_bench(cycles);
3850 diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c
3851 index 4da2af9694a2..5b6f57f500b8 100644
3852 --- a/drivers/clocksource/tcb_clksrc.c
3853 +++ b/drivers/clocksource/tcb_clksrc.c
3854 @@ -23,8 +23,7 @@
3855   *     this 32 bit free-running counter. the second channel is not used.
3856   *
3857   *   - The third channel may be used to provide a 16-bit clockevent
3858 - *     source, used in either periodic or oneshot mode.  This runs
3859 - *     at 32 KiHZ, and can handle delays of up to two seconds.
3860 + *     source, used in either periodic or oneshot mode.
3861   *
3862   * A boot clocksource and clockevent source are also currently needed,
3863   * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so
3864 @@ -74,6 +73,8 @@ static struct clocksource clksrc = {
3865  struct tc_clkevt_device {
3866         struct clock_event_device       clkevt;
3867         struct clk                      *clk;
3868 +       bool                            clk_enabled;
3869 +       u32                             freq;
3870         void __iomem                    *regs;
3871  };
3872  
3873 @@ -82,15 +83,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt)
3874         return container_of(clkevt, struct tc_clkevt_device, clkevt);
3875  }
3876  
3877 -/* For now, we always use the 32K clock ... this optimizes for NO_HZ,
3878 - * because using one of the divided clocks would usually mean the
3879 - * tick rate can never be less than several dozen Hz (vs 0.5 Hz).
3880 - *
3881 - * A divided clock could be good for high resolution timers, since
3882 - * 30.5 usec resolution can seem "low".
3883 - */
3884  static u32 timer_clock;
3885  
3886 +static void tc_clk_disable(struct clock_event_device *d)
3887 +{
3888 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3889 +
3890 +       clk_disable(tcd->clk);
3891 +       tcd->clk_enabled = false;
3892 +}
3893 +
3894 +static void tc_clk_enable(struct clock_event_device *d)
3895 +{
3896 +       struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3897 +
3898 +       if (tcd->clk_enabled)
3899 +               return;
3900 +       clk_enable(tcd->clk);
3901 +       tcd->clk_enabled = true;
3902 +}
3903 +
3904  static int tc_shutdown(struct clock_event_device *d)
3905  {
3906         struct tc_clkevt_device *tcd = to_tc_clkevt(d);
3907 @@ -98,8 +110,14 @@ static int tc_shutdown(struct clock_event_device *d)
3908  
3909         __raw_writel(0xff, regs + ATMEL_TC_REG(2, IDR));
3910         __raw_writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR));
3911 +       return 0;
3912 +}
3913 +
3914 +static int tc_shutdown_clk_off(struct clock_event_device *d)
3915 +{
3916 +       tc_shutdown(d);
3917         if (!clockevent_state_detached(d))
3918 -               clk_disable(tcd->clk);
3919 +               tc_clk_disable(d);
3920  
3921         return 0;
3922  }
3923 @@ -112,9 +130,9 @@ static int tc_set_oneshot(struct clock_event_device *d)
3924         if (clockevent_state_oneshot(d) || clockevent_state_periodic(d))
3925                 tc_shutdown(d);
3926  
3927 -       clk_enable(tcd->clk);
3928 +       tc_clk_enable(d);
3929  
3930 -       /* slow clock, count up to RC, then irq and stop */
3931 +       /* count up to RC, then irq and stop */
3932         __raw_writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE |
3933                      ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR));
3934         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3935 @@ -134,12 +152,12 @@ static int tc_set_periodic(struct clock_event_device *d)
3936         /* By not making the gentime core emulate periodic mode on top
3937          * of oneshot, we get lower overhead and improved accuracy.
3938          */
3939 -       clk_enable(tcd->clk);
3940 +       tc_clk_enable(d);
3941  
3942 -       /* slow clock, count up to RC, then irq and restart */
3943 +       /* count up to RC, then irq and restart */
3944         __raw_writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO,
3945                      regs + ATMEL_TC_REG(2, CMR));
3946 -       __raw_writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3947 +       __raw_writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC));
3948  
3949         /* Enable clock and interrupts on RC compare */
3950         __raw_writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER));
3951 @@ -166,9 +184,13 @@ static struct tc_clkevt_device clkevt = {
3952                 .features               = CLOCK_EVT_FEAT_PERIODIC |
3953                                           CLOCK_EVT_FEAT_ONESHOT,
3954                 /* Should be lower than at91rm9200's system timer */
3955 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
3956                 .rating                 = 125,
3957 +#else
3958 +               .rating                 = 200,
3959 +#endif
3960                 .set_next_event         = tc_next_event,
3961 -               .set_state_shutdown     = tc_shutdown,
3962 +               .set_state_shutdown     = tc_shutdown_clk_off,
3963                 .set_state_periodic     = tc_set_periodic,
3964                 .set_state_oneshot      = tc_set_oneshot,
3965         },
3966 @@ -188,8 +210,9 @@ static irqreturn_t ch2_irq(int irq, void *handle)
3967         return IRQ_NONE;
3968  }
3969  
3970 -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3971 +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx)
3972  {
3973 +       unsigned divisor = atmel_tc_divisors[divisor_idx];
3974         int ret;
3975         struct clk *t2_clk = tc->clk[2];
3976         int irq = tc->irq[2];
3977 @@ -210,7 +233,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3978         clkevt.regs = tc->regs;
3979         clkevt.clk = t2_clk;
3980  
3981 -       timer_clock = clk32k_divisor_idx;
3982 +       timer_clock = divisor_idx;
3983 +       if (!divisor)
3984 +               clkevt.freq = 32768;
3985 +       else
3986 +               clkevt.freq = clk_get_rate(t2_clk) / divisor;
3987  
3988         clkevt.clkevt.cpumask = cpumask_of(0);
3989  
3990 @@ -221,7 +248,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
3991                 return ret;
3992         }
3993  
3994 -       clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff);
3995 +       clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff);
3996  
3997         return ret;
3998  }
3999 @@ -358,7 +385,11 @@ static int __init tcb_clksrc_init(void)
4000                 goto err_disable_t1;
4001  
4002         /* channel 2:  periodic and oneshot timer support */
4003 +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4004         ret = setup_clkevents(tc, clk32k_divisor_idx);
4005 +#else
4006 +       ret = setup_clkevents(tc, best_divisor_idx);
4007 +#endif
4008         if (ret)
4009                 goto err_unregister_clksrc;
4010  
4011 diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c
4012 index 7f0f5b26d8c5..1553f19e73e7 100644
4013 --- a/drivers/clocksource/timer-atmel-pit.c
4014 +++ b/drivers/clocksource/timer-atmel-pit.c
4015 @@ -46,6 +46,7 @@ struct pit_data {
4016         u32             cycle;
4017         u32             cnt;
4018         unsigned int    irq;
4019 +       bool            irq_requested;
4020         struct clk      *mck;
4021  };
4022  
4023 @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev)
4024  
4025         /* disable irq, leaving the clocksource active */
4026         pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN);
4027 +       if (data->irq_requested) {
4028 +               free_irq(data->irq, data);
4029 +               data->irq_requested = false;
4030 +       }
4031         return 0;
4032  }
4033  
4034 +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id);
4035  /*
4036   * Clockevent device:  interrupts every 1/HZ (== pit_cycles * MCK/16)
4037   */
4038  static int pit_clkevt_set_periodic(struct clock_event_device *dev)
4039  {
4040         struct pit_data *data = clkevt_to_pit_data(dev);
4041 +       int ret;
4042 +
4043 +       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4044 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4045 +                         "at91_tick", data);
4046 +       if (ret)
4047 +               panic(pr_fmt("Unable to setup IRQ\n"));
4048 +
4049 +       data->irq_requested = true;
4050  
4051         /* update clocksource counter */
4052         data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR));
4053 @@ -211,15 +226,6 @@ static int __init at91sam926x_pit_common_init(struct pit_data *data)
4054                 return ret;
4055         }
4056  
4057 -       /* Set up irq handler */
4058 -       ret = request_irq(data->irq, at91sam926x_pit_interrupt,
4059 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4060 -                         "at91_tick", data);
4061 -       if (ret) {
4062 -               pr_err("Unable to setup IRQ\n");
4063 -               return ret;
4064 -       }
4065 -
4066         /* Set up and register clockevents */
4067         data->clkevt.name = "pit";
4068         data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC;
4069 diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c
4070 index e90ab5b63a90..9e124087c55f 100644
4071 --- a/drivers/clocksource/timer-atmel-st.c
4072 +++ b/drivers/clocksource/timer-atmel-st.c
4073 @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void)
4074         last_crtr = read_CRTR();
4075  }
4076  
4077 +static int atmel_st_irq;
4078 +
4079  static int clkevt32k_shutdown(struct clock_event_device *evt)
4080  {
4081         clkdev32k_disable_and_flush_irq();
4082         irqmask = 0;
4083         regmap_write(regmap_st, AT91_ST_IER, irqmask);
4084 +       free_irq(atmel_st_irq, regmap_st);
4085         return 0;
4086  }
4087  
4088  static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4089  {
4090 +       int ret;
4091 +
4092         clkdev32k_disable_and_flush_irq();
4093  
4094 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4095 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4096 +                         "at91_tick", regmap_st);
4097 +       if (ret)
4098 +               panic(pr_fmt("Unable to setup IRQ\n"));
4099 +
4100         /*
4101          * ALM for oneshot irqs, set by next_event()
4102          * before 32 seconds have passed.
4103 @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev)
4104  
4105  static int clkevt32k_set_periodic(struct clock_event_device *dev)
4106  {
4107 +       int ret;
4108 +
4109         clkdev32k_disable_and_flush_irq();
4110  
4111 +       ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt,
4112 +                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4113 +                         "at91_tick", regmap_st);
4114 +       if (ret)
4115 +               panic(pr_fmt("Unable to setup IRQ\n"));
4116 +
4117         /* PIT for periodic irqs; fixed rate of 1/HZ */
4118         irqmask = AT91_ST_PITS;
4119         regmap_write(regmap_st, AT91_ST_PIMR, timer_latch);
4120 @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node)
4121  {
4122         struct clk *sclk;
4123         unsigned int sclk_rate, val;
4124 -       int irq, ret;
4125 +       int ret;
4126  
4127         regmap_st = syscon_node_to_regmap(node);
4128         if (IS_ERR(regmap_st)) {
4129 @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node)
4130         regmap_read(regmap_st, AT91_ST_SR, &val);
4131  
4132         /* Get the interrupts property */
4133 -       irq  = irq_of_parse_and_map(node, 0);
4134 -       if (!irq) {
4135 +       atmel_st_irq  = irq_of_parse_and_map(node, 0);
4136 +       if (!atmel_st_irq) {
4137                 pr_err("Unable to get IRQ from DT\n");
4138                 return -EINVAL;
4139         }
4140  
4141 -       /* Make IRQs happen for the system timer */
4142 -       ret = request_irq(irq, at91rm9200_timer_interrupt,
4143 -                         IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
4144 -                         "at91_tick", regmap_st);
4145 -       if (ret) {
4146 -               pr_err("Unable to setup IRQ\n");
4147 -               return ret;
4148 -       }
4149 -
4150         sclk = of_clk_get(node, 0);
4151         if (IS_ERR(sclk)) {
4152                 pr_err("Unable to get slow clock\n");
4153 diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
4154 index a782ce87715c..19d265948526 100644
4155 --- a/drivers/connector/cn_proc.c
4156 +++ b/drivers/connector/cn_proc.c
4157 @@ -32,6 +32,7 @@
4158  #include <linux/pid_namespace.h>
4159  
4160  #include <linux/cn_proc.h>
4161 +#include <linux/locallock.h>
4162  
4163  /*
4164   * Size of a cn_msg followed by a proc_event structure.  Since the
4165 @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
4166  
4167  /* proc_event_counts is used as the sequence number of the netlink message */
4168  static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
4169 +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock);
4170  
4171  static inline void send_msg(struct cn_msg *msg)
4172  {
4173 -       preempt_disable();
4174 +       local_lock(send_msg_lock);
4175  
4176         msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
4177         ((struct proc_event *)msg->data)->cpu = smp_processor_id();
4178 @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg)
4179          */
4180         cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
4181  
4182 -       preempt_enable();
4183 +       local_unlock(send_msg_lock);
4184  }
4185  
4186  void proc_fork_connector(struct task_struct *task)
4187 diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
4188 index adbd1de1cea5..1fac5074f2cf 100644
4189 --- a/drivers/cpufreq/Kconfig.x86
4190 +++ b/drivers/cpufreq/Kconfig.x86
4191 @@ -124,7 +124,7 @@ config X86_POWERNOW_K7_ACPI
4192  
4193  config X86_POWERNOW_K8
4194         tristate "AMD Opteron/Athlon64 PowerNow!"
4195 -       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ
4196 +       depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE
4197         help
4198           This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors.
4199           Support for K10 and newer processors is now in acpi-cpufreq.
4200 diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4201 index b35e5b6475b2..ce60807fb1d4 100644
4202 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4203 +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
4204 @@ -1302,7 +1302,9 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params,
4205         if (ret)
4206                 return ret;
4207  
4208 +#ifndef CONFIG_PREEMPT_RT_BASE
4209         trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
4210 +#endif
4211  
4212         i915_gem_execbuffer_move_to_active(vmas, params->request);
4213  
4214 diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4215 index 6f10b421487b..dd3a9a6ace11 100644
4216 --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
4217 +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
4218 @@ -40,7 +40,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4219         if (!mutex_is_locked(mutex))
4220                 return false;
4221  
4222 -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)
4223 +#if (defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER)) && !defined(CONFIG_PREEMPT_RT_BASE)
4224         return mutex->owner == task;
4225  #else
4226         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4227 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
4228 index 1c2aec392412..1d85c0c791f1 100644
4229 --- a/drivers/gpu/drm/i915/i915_irq.c
4230 +++ b/drivers/gpu/drm/i915/i915_irq.c
4231 @@ -812,6 +812,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4232         spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
4233  
4234         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4235 +       preempt_disable_rt();
4236  
4237         /* Get optional system timestamp before query. */
4238         if (stime)
4239 @@ -863,6 +864,7 @@ static int i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4240                 *etime = ktime_get();
4241  
4242         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4243 +       preempt_enable_rt();
4244  
4245         spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
4246  
4247 diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
4248 index e9a64fba6333..2aac27b13d86 100644
4249 --- a/drivers/gpu/drm/i915/intel_display.c
4250 +++ b/drivers/gpu/drm/i915/intel_display.c
4251 @@ -11647,7 +11647,7 @@ void intel_check_page_flip(struct drm_i915_private *dev_priv, int pipe)
4252         struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
4253         struct intel_flip_work *work;
4254  
4255 -       WARN_ON(!in_interrupt());
4256 +       WARN_ON_NONRT(!in_interrupt());
4257  
4258         if (crtc == NULL)
4259                 return;
4260 diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
4261 index 4178849631ad..0eb939c92544 100644
4262 --- a/drivers/gpu/drm/i915/intel_sprite.c
4263 +++ b/drivers/gpu/drm/i915/intel_sprite.c
4264 @@ -38,6 +38,7 @@
4265  #include "intel_drv.h"
4266  #include <drm/i915_drm.h>
4267  #include "i915_drv.h"
4268 +#include <linux/locallock.h>
4269  
4270  static bool
4271  format_is_yuv(uint32_t format)
4272 @@ -64,6 +65,8 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
4273                             1000 * adjusted_mode->crtc_htotal);
4274  }
4275  
4276 +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock);
4277 +
4278  /**
4279   * intel_pipe_update_start() - start update of a set of display registers
4280   * @crtc: the crtc of which the registers are going to be updated
4281 @@ -94,7 +97,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4282         min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
4283         max = vblank_start - 1;
4284  
4285 -       local_irq_disable();
4286 +       local_lock_irq(pipe_update_lock);
4287  
4288         if (min <= 0 || max <= 0)
4289                 return;
4290 @@ -124,11 +127,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
4291                         break;
4292                 }
4293  
4294 -               local_irq_enable();
4295 +               local_unlock_irq(pipe_update_lock);
4296  
4297                 timeout = schedule_timeout(timeout);
4298  
4299 -               local_irq_disable();
4300 +               local_lock_irq(pipe_update_lock);
4301         }
4302  
4303         finish_wait(wq, &wait);
4304 @@ -180,7 +183,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc, struct intel_flip_work *work
4305                 crtc->base.state->event = NULL;
4306         }
4307  
4308 -       local_irq_enable();
4309 +       local_unlock_irq(pipe_update_lock);
4310  
4311         if (crtc->debug.start_vbl_count &&
4312             crtc->debug.start_vbl_count != end_vbl_count) {
4313 diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4314 index 283d2841ba58..d01f6ed1977e 100644
4315 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c
4316 +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c
4317 @@ -23,7 +23,7 @@ static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
4318         if (!mutex_is_locked(mutex))
4319                 return false;
4320  
4321 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)
4322 +#if (defined(CONFIG_SMP) || defined(CONFIG_DEBUG_MUTEXES)) && !defined(CONFIG_PREEMPT_RT_BASE)
4323         return mutex->owner == task;
4324  #else
4325         /* Since UP may be pre-empted, we cannot assume that we own the lock */
4326 diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
4327 index c3206fb8f4cf..6e2423186e2a 100644
4328 --- a/drivers/gpu/drm/radeon/radeon_display.c
4329 +++ b/drivers/gpu/drm/radeon/radeon_display.c
4330 @@ -1869,6 +1869,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4331         struct radeon_device *rdev = dev->dev_private;
4332  
4333         /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */
4334 +       preempt_disable_rt();
4335  
4336         /* Get optional system timestamp before query. */
4337         if (stime)
4338 @@ -1961,6 +1962,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe,
4339                 *etime = ktime_get();
4340  
4341         /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */
4342 +       preempt_enable_rt();
4343  
4344         /* Decode into vertical and horizontal scanout position. */
4345         *vpos = position & 0x1fff;
4346 diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
4347 index e82f7e1c217c..b57d917b6ab7 100644
4348 --- a/drivers/hv/vmbus_drv.c
4349 +++ b/drivers/hv/vmbus_drv.c
4350 @@ -761,6 +761,8 @@ static void vmbus_isr(void)
4351         void *page_addr;
4352         struct hv_message *msg;
4353         union hv_synic_event_flags *event;
4354 +       struct pt_regs *regs = get_irq_regs();
4355 +       u64 ip = regs ? instruction_pointer(regs) : 0;
4356         bool handled = false;
4357  
4358         page_addr = hv_context.synic_event_page[cpu];
4359 @@ -808,7 +810,7 @@ static void vmbus_isr(void)
4360                         tasklet_schedule(hv_context.msg_dpc[cpu]);
4361         }
4362  
4363 -       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
4364 +       add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip);
4365  }
4366  
4367  
4368 diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
4369 index 36f76e28a0bf..394f142f90c7 100644
4370 --- a/drivers/ide/alim15x3.c
4371 +++ b/drivers/ide/alim15x3.c
4372 @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4373  
4374         isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL);
4375  
4376 -       local_irq_save(flags);
4377 +       local_irq_save_nort(flags);
4378  
4379         if (m5229_revision < 0xC2) {
4380                 /*
4381 @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev)
4382         }
4383         pci_dev_put(north);
4384         pci_dev_put(isa_dev);
4385 -       local_irq_restore(flags);
4386 +       local_irq_restore_nort(flags);
4387         return 0;
4388  }
4389  
4390 diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c
4391 index 0ceae5cbd89a..c212e85d7f3e 100644
4392 --- a/drivers/ide/hpt366.c
4393 +++ b/drivers/ide/hpt366.c
4394 @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4395  
4396         dma_old = inb(base + 2);
4397  
4398 -       local_irq_save(flags);
4399 +       local_irq_save_nort(flags);
4400  
4401         dma_new = dma_old;
4402         pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma);
4403 @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif,
4404         if (dma_new != dma_old)
4405                 outb(dma_new, base + 2);
4406  
4407 -       local_irq_restore(flags);
4408 +       local_irq_restore_nort(flags);
4409  
4410         printk(KERN_INFO "    %s: BM-DMA at 0x%04lx-0x%04lx\n",
4411                          hwif->name, base, base + 7);
4412 diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c
4413 index 19763977568c..4169433faab5 100644
4414 --- a/drivers/ide/ide-io-std.c
4415 +++ b/drivers/ide/ide-io-std.c
4416 @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4417                 unsigned long uninitialized_var(flags);
4418  
4419                 if ((io_32bit & 2) && !mmio) {
4420 -                       local_irq_save(flags);
4421 +                       local_irq_save_nort(flags);
4422                         ata_vlb_sync(io_ports->nsect_addr);
4423                 }
4424  
4425 @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4426                         insl(data_addr, buf, words);
4427  
4428                 if ((io_32bit & 2) && !mmio)
4429 -                       local_irq_restore(flags);
4430 +                       local_irq_restore_nort(flags);
4431  
4432                 if (((len + 1) & 3) < 2)
4433                         return;
4434 @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4435                 unsigned long uninitialized_var(flags);
4436  
4437                 if ((io_32bit & 2) && !mmio) {
4438 -                       local_irq_save(flags);
4439 +                       local_irq_save_nort(flags);
4440                         ata_vlb_sync(io_ports->nsect_addr);
4441                 }
4442  
4443 @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf,
4444                         outsl(data_addr, buf, words);
4445  
4446                 if ((io_32bit & 2) && !mmio)
4447 -                       local_irq_restore(flags);
4448 +                       local_irq_restore_nort(flags);
4449  
4450                 if (((len + 1) & 3) < 2)
4451                         return;
4452 diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
4453 index 669ea1e45795..e12e43e62245 100644
4454 --- a/drivers/ide/ide-io.c
4455 +++ b/drivers/ide/ide-io.c
4456 @@ -659,7 +659,7 @@ void ide_timer_expiry (unsigned long data)
4457                 /* disable_irq_nosync ?? */
4458                 disable_irq(hwif->irq);
4459                 /* local CPU only, as if we were handling an interrupt */
4460 -               local_irq_disable();
4461 +               local_irq_disable_nort();
4462                 if (hwif->polling) {
4463                         startstop = handler(drive);
4464                 } else if (drive_is_ready(drive)) {
4465 diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c
4466 index 376f2dc410c5..f014dd1b73dc 100644
4467 --- a/drivers/ide/ide-iops.c
4468 +++ b/drivers/ide/ide-iops.c
4469 @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad,
4470                                 if ((stat & ATA_BUSY) == 0)
4471                                         break;
4472  
4473 -                               local_irq_restore(flags);
4474 +                               local_irq_restore_nort(flags);
4475                                 *rstat = stat;
4476                                 return -EBUSY;
4477                         }
4478                 }
4479 -               local_irq_restore(flags);
4480 +               local_irq_restore_nort(flags);
4481         }
4482         /*
4483          * Allow status to settle, then read it again.
4484 diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
4485 index 0b63facd1d87..4ceba37afc0c 100644
4486 --- a/drivers/ide/ide-probe.c
4487 +++ b/drivers/ide/ide-probe.c
4488 @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id)
4489         int bswap = 1;
4490  
4491         /* local CPU only; some systems need this */
4492 -       local_irq_save(flags);
4493 +       local_irq_save_nort(flags);
4494         /* read 512 bytes of id info */
4495         hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE);
4496 -       local_irq_restore(flags);
4497 +       local_irq_restore_nort(flags);
4498  
4499         drive->dev_flags |= IDE_DFLAG_ID_READ;
4500  #ifdef DEBUG
4501 diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
4502 index a716693417a3..be0568c722d6 100644
4503 --- a/drivers/ide/ide-taskfile.c
4504 +++ b/drivers/ide/ide-taskfile.c
4505 @@ -250,7 +250,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4506  
4507                 page_is_high = PageHighMem(page);
4508                 if (page_is_high)
4509 -                       local_irq_save(flags);
4510 +                       local_irq_save_nort(flags);
4511  
4512                 buf = kmap_atomic(page) + offset;
4513  
4514 @@ -271,7 +271,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd,
4515                 kunmap_atomic(buf);
4516  
4517                 if (page_is_high)
4518 -                       local_irq_restore(flags);
4519 +                       local_irq_restore_nort(flags);
4520  
4521                 len -= nr_bytes;
4522         }
4523 @@ -414,7 +414,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive,
4524         }
4525  
4526         if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0)
4527 -               local_irq_disable();
4528 +               local_irq_disable_nort();
4529  
4530         ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE);
4531  
4532 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4533 index d3394b6add24..506bfba6ec9f 100644
4534 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4535 +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
4536 @@ -897,7 +897,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4537  
4538         ipoib_dbg_mcast(priv, "restarting multicast task\n");
4539  
4540 -       local_irq_save(flags);
4541 +       local_irq_save_nort(flags);
4542         netif_addr_lock(dev);
4543         spin_lock(&priv->lock);
4544  
4545 @@ -979,7 +979,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
4546  
4547         spin_unlock(&priv->lock);
4548         netif_addr_unlock(dev);
4549 -       local_irq_restore(flags);
4550 +       local_irq_restore_nort(flags);
4551  
4552         /*
4553          * make sure the in-flight joins have finished before we attempt
4554 diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
4555 index 4a2a9e370be7..e970d9afd179 100644
4556 --- a/drivers/input/gameport/gameport.c
4557 +++ b/drivers/input/gameport/gameport.c
4558 @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport)
4559         tx = ~0;
4560  
4561         for (i = 0; i < 50; i++) {
4562 -               local_irq_save(flags);
4563 +               local_irq_save_nort(flags);
4564                 t1 = ktime_get_ns();
4565                 for (t = 0; t < 50; t++)
4566                         gameport_read(gameport);
4567                 t2 = ktime_get_ns();
4568                 t3 = ktime_get_ns();
4569 -               local_irq_restore(flags);
4570 +               local_irq_restore_nort(flags);
4571                 udelay(i * 10);
4572                 t = (t2 - t1) - (t3 - t2);
4573                 if (t < tx)
4574 @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4575         tx = 1 << 30;
4576  
4577         for(i = 0; i < 50; i++) {
4578 -               local_irq_save(flags);
4579 +               local_irq_save_nort(flags);
4580                 GET_TIME(t1);
4581                 for (t = 0; t < 50; t++) gameport_read(gameport);
4582                 GET_TIME(t2);
4583                 GET_TIME(t3);
4584 -               local_irq_restore(flags);
4585 +               local_irq_restore_nort(flags);
4586                 udelay(i * 10);
4587                 if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
4588         }
4589 @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport)
4590         tx = 1 << 30;
4591  
4592         for(i = 0; i < 50; i++) {
4593 -               local_irq_save(flags);
4594 +               local_irq_save_nort(flags);
4595                 t1 = rdtsc();
4596                 for (t = 0; t < 50; t++) gameport_read(gameport);
4597                 t2 = rdtsc();
4598 -               local_irq_restore(flags);
4599 +               local_irq_restore_nort(flags);
4600                 udelay(i * 10);
4601                 if (t2 - t1 < tx) tx = t2 - t1;
4602         }
4603 diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
4604 index 96de97a46079..a6ec875d941b 100644
4605 --- a/drivers/iommu/amd_iommu.c
4606 +++ b/drivers/iommu/amd_iommu.c
4607 @@ -1832,10 +1832,10 @@ static int __attach_device(struct iommu_dev_data *dev_data,
4608         int ret;
4609  
4610         /*
4611 -        * Must be called with IRQs disabled. Warn here to detect early
4612 -        * when its not.
4616 4613 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4617 4614 +        * detect early when it's not.
4615          */
4616 -       WARN_ON(!irqs_disabled());
4617 +       WARN_ON_NONRT(!irqs_disabled());
4618  
4619         /* lock domain */
4620         spin_lock(&domain->lock);
4621 @@ -2003,10 +2003,10 @@ static void __detach_device(struct iommu_dev_data *dev_data)
4622         struct protection_domain *domain;
4623  
4624         /*
4625 -        * Must be called with IRQs disabled. Warn here to detect early
4626 -        * when its not.
4630 4627 +        * Must be called with IRQs disabled on a non-RT kernel. Warn here to
4631 4628 +        * detect early when it's not.
4629          */
4630 -       WARN_ON(!irqs_disabled());
4631 +       WARN_ON_NONRT(!irqs_disabled());
4632  
4633         if (WARN_ON(!dev_data->domain))
4634                 return;
4635 diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
4636 index ebb5bf3ddbd9..598f5df45f6b 100644
4637 --- a/drivers/iommu/intel-iommu.c
4638 +++ b/drivers/iommu/intel-iommu.c
4639 @@ -479,7 +479,7 @@ struct deferred_flush_data {
4640         struct deferred_flush_table *tables;
4641  };
4642  
4643 -DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4644 +static DEFINE_PER_CPU(struct deferred_flush_data, deferred_flush);
4645  
4646  /* bitmap for indexing intel_iommus */
4647  static int g_num_of_iommus;
4648 @@ -3626,10 +3626,8 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4649         struct intel_iommu *iommu;
4650         struct deferred_flush_entry *entry;
4651         struct deferred_flush_data *flush_data;
4652 -       unsigned int cpuid;
4653  
4654 -       cpuid = get_cpu();
4655 -       flush_data = per_cpu_ptr(&deferred_flush, cpuid);
4656 +       flush_data = raw_cpu_ptr(&deferred_flush);
4657  
4658         /* Flush all CPUs' entries to avoid deferring too much.  If
4659          * this becomes a bottleneck, can just flush us, and rely on
4660 @@ -3662,8 +3660,6 @@ static void add_unmap(struct dmar_domain *dom, unsigned long iova_pfn,
4661         }
4662         flush_data->size++;
4663         spin_unlock_irqrestore(&flush_data->lock, flags);
4664 -
4665 -       put_cpu();
4666  }
4667  
4668  static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
4669 diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
4670 index e23001bfcfee..359d5d169ec0 100644
4671 --- a/drivers/iommu/iova.c
4672 +++ b/drivers/iommu/iova.c
4673 @@ -22,6 +22,7 @@
4674  #include <linux/slab.h>
4675  #include <linux/smp.h>
4676  #include <linux/bitops.h>
4677 +#include <linux/cpu.h>
4678  
4679  static bool iova_rcache_insert(struct iova_domain *iovad,
4680                                unsigned long pfn,
4681 @@ -420,10 +421,8 @@ alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
4682  
4683                 /* Try replenishing IOVAs by flushing rcache. */
4684                 flushed_rcache = true;
4685 -               preempt_disable();
4686                 for_each_online_cpu(cpu)
4687                         free_cpu_cached_iovas(cpu, iovad);
4688 -               preempt_enable();
4689                 goto retry;
4690         }
4691  
4692 @@ -751,7 +750,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4693         bool can_insert = false;
4694         unsigned long flags;
4695  
4696 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4697 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4698         spin_lock_irqsave(&cpu_rcache->lock, flags);
4699  
4700         if (!iova_magazine_full(cpu_rcache->loaded)) {
4701 @@ -781,7 +780,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
4702                 iova_magazine_push(cpu_rcache->loaded, iova_pfn);
4703  
4704         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4705 -       put_cpu_ptr(rcache->cpu_rcaches);
4706  
4707         if (mag_to_free) {
4708                 iova_magazine_free_pfns(mag_to_free, iovad);
4709 @@ -815,7 +813,7 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4710         bool has_pfn = false;
4711         unsigned long flags;
4712  
4713 -       cpu_rcache = get_cpu_ptr(rcache->cpu_rcaches);
4714 +       cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
4715         spin_lock_irqsave(&cpu_rcache->lock, flags);
4716  
4717         if (!iova_magazine_empty(cpu_rcache->loaded)) {
4718 @@ -837,7 +835,6 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
4719                 iova_pfn = iova_magazine_pop(cpu_rcache->loaded, limit_pfn);
4720  
4721         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
4722 -       put_cpu_ptr(rcache->cpu_rcaches);
4723  
4724         return iova_pfn;
4725  }
4726 diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig
4727 index 3f9ddb9fafa7..09da5b6b44a1 100644
4728 --- a/drivers/leds/trigger/Kconfig
4729 +++ b/drivers/leds/trigger/Kconfig
4730 @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT
4731  
4732  config LEDS_TRIGGER_CPU
4733         bool "LED CPU Trigger"
4734 -       depends on LEDS_TRIGGERS
4735 +       depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE
4736         help
4737           This allows LEDs to be controlled by active CPUs. This shows
4738           the active CPUs across an array of LEDs so you can see which
4739 diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
4740 index 4d200883c505..98b64ed5cb81 100644
4741 --- a/drivers/md/bcache/Kconfig
4742 +++ b/drivers/md/bcache/Kconfig
4743 @@ -1,6 +1,7 @@
4744  
4745  config BCACHE
4746         tristate "Block device as cache"
4747 +       depends on !PREEMPT_RT_FULL
4748         ---help---
4749         Allows a block device to be used as cache for other devices; uses
4750         a btree for indexing and the layout is optimized for SSDs.
4751 diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
4752 index 5da86c8b6545..2aa092f2977e 100644
4753 --- a/drivers/md/dm-rq.c
4754 +++ b/drivers/md/dm-rq.c
4755 @@ -811,7 +811,7 @@ static void dm_old_request_fn(struct request_queue *q)
4756                 /* Establish tio->ti before queuing work (map_tio_request) */
4757                 tio->ti = ti;
4758                 queue_kthread_work(&md->kworker, &tio->work);
4759 -               BUG_ON(!irqs_disabled());
4760 +               BUG_ON_NONRT(!irqs_disabled());
4761         }
4762  }
4763  
4764 diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
4765 index ee7fc3701700..ae59c9e13911 100644
4766 --- a/drivers/md/raid5.c
4767 +++ b/drivers/md/raid5.c
4768 @@ -1928,8 +1928,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4769         struct raid5_percpu *percpu;
4770         unsigned long cpu;
4771  
4772 -       cpu = get_cpu();
4773 +       cpu = get_cpu_light();
4774         percpu = per_cpu_ptr(conf->percpu, cpu);
4775 +       spin_lock(&percpu->lock);
4776         if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
4777                 ops_run_biofill(sh);
4778                 overlap_clear++;
4779 @@ -1985,7 +1986,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
4780                         if (test_and_clear_bit(R5_Overlap, &dev->flags))
4781                                 wake_up(&sh->raid_conf->wait_for_overlap);
4782                 }
4783 -       put_cpu();
4784 +       spin_unlock(&percpu->lock);
4785 +       put_cpu_light();
4786  }
4787  
4788  static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
4789 @@ -6438,6 +6440,7 @@ static int raid5_alloc_percpu(struct r5conf *conf)
4790                                __func__, cpu);
4791                         break;
4792                 }
4793 +               spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock);
4794         }
4795         put_online_cpus();
4796  
4797 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
4798 index 517d4b68a1be..efe91887ecd7 100644
4799 --- a/drivers/md/raid5.h
4800 +++ b/drivers/md/raid5.h
4801 @@ -504,6 +504,7 @@ struct r5conf {
4802         int                     recovery_disabled;
4803         /* per cpu variables */
4804         struct raid5_percpu {
4805 +               spinlock_t      lock;           /* Protection for -RT */
4806                 struct page     *spare_page; /* Used when checking P/Q in raid6 */
4807                 struct flex_array *scribble;   /* space for constructing buffer
4808                                               * lists and performing address
4809 diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
4810 index d00252828966..9faab404faac 100644
4811 --- a/drivers/misc/Kconfig
4812 +++ b/drivers/misc/Kconfig
4813 @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI
4814  config ATMEL_TCLIB
4815         bool "Atmel AT32/AT91 Timer/Counter Library"
4816         depends on (AVR32 || ARCH_AT91)
4817 +       default y if PREEMPT_RT_FULL
4818         help
4819           Select this if you want a library to allocate the Timer/Counter
4820           blocks found on many Atmel processors.  This facilitates using
4821 @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC
4822           are combined to make a single 32-bit timer.
4823  
4824           When GENERIC_CLOCKEVENTS is defined, the third timer channel
4825 -         may be used as a clock event device supporting oneshot mode
4826 -         (delays of up to two seconds) based on the 32 KiHz clock.
4827 +         may be used as a clock event device supporting oneshot mode.
4828  
4829  config ATMEL_TCB_CLKSRC_BLOCK
4830         int
4831 @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK
4832           TC can be used for other purposes, such as PWM generation and
4833           interval timing.
4834  
4835 +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK
4836 +       bool "TC Block use 32 KiHz clock"
4837 +       depends on ATMEL_TCB_CLKSRC
4838 +       default y if !PREEMPT_RT_FULL
4839 +       help
4840 +         Select this to use 32 KiHz base clock rate as TC block clock
4841 +         source for clock events.
4842 +
4843 +
4844  config DUMMY_IRQ
4845         tristate "Dummy IRQ handler"
4846         default n
4847 @@ -114,6 +123,35 @@ config IBM_ASM
4848           for information on the specific driver level and support statement
4849           for your IBM server.
4850  
4851 +config HWLAT_DETECTOR
4852 +       tristate "Testing module to detect hardware-induced latencies"
4853 +       depends on DEBUG_FS
4854 +       depends on RING_BUFFER
4855 +       default m
4856 +       ---help---
4857 +         A simple hardware latency detector. Use this module to detect
4858 +         large latencies introduced by the behavior of the underlying
4859 +         system firmware external to Linux. We do this using periodic
4860 +         use of stop_machine to grab all available CPUs and measure
4861 +         for unexplainable gaps in the CPU timestamp counter(s). By
4862 +         default, the module is not enabled until the "enable" file
4863 +         within the "hwlat_detector" debugfs directory is toggled.
4864 +
4865 +         This module is often used to detect SMI (System Management
4869 4866 +         Interrupts) on x86 systems, though it is not x86 specific. To
4867 +         this end, we default to using a sample window of 1 second,
4868 +         during which we will sample for 0.5 seconds. If an SMI or
4869 +         similar event occurs during that time, it is recorded
4873 4870 +         into an 8K-sample global ring buffer until retrieved.
4871 +
4872 +         WARNING: This software should never be enabled (it can be built
4873 +         but should not be turned on after it is loaded) in a production
4874 +         environment where high latencies are a concern since the
4875 +         sampling mechanism actually introduces latencies for
4876 +         regular tasks while the CPU(s) are being held.
4877 +
4878 +         If unsure, say N
4879 +
4880  config PHANTOM
4881         tristate "Sensable PHANToM (PCI)"
4882         depends on PCI
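
Once HWLAT_DETECTOR is built and the module loaded, the statistics described in the help text above are exposed through debugfs. A minimal user-space reader is sketched below; it assumes debugfs is mounted at /sys/kernel/debug and uses entry names matching the dentries set up later in hwlat_detector.c (enable, count, max). Adjust the base path if your debugfs mount point differs:

#include <stdio.h>

/* Assumed debugfs mount point; not taken from the patch itself. */
#define BASE "/sys/kernel/debug/hwlat_detector/"

static int read_entry(const char *name, char *buf, size_t len)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), BASE "%s", name);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return -1;
        }
        if (!fgets(buf, len, f))
                buf[0] = '\0';
        fclose(f);
        return 0;
}

int main(void)
{
        const char *entries[] = { "enable", "count", "max" };
        char line[64];
        size_t i;

        for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
                if (read_entry(entries[i], line, sizeof(line)) == 0)
                        printf("%-6s: %s", entries[i], line);
        }
        return 0;
}
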
4883 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
4884 index fb32516ddfe2..8643df9af3c4 100644
4885 --- a/drivers/misc/Makefile
4886 +++ b/drivers/misc/Makefile
4887 @@ -38,6 +38,7 @@ obj-$(CONFIG_C2PORT)          += c2port/
4888  obj-$(CONFIG_HMC6352)          += hmc6352.o
4889  obj-y                          += eeprom/
4890  obj-y                          += cb710/
4891 +obj-$(CONFIG_HWLAT_DETECTOR)   += hwlat_detector.o
4892  obj-$(CONFIG_SPEAR13XX_PCIE_GADGET)    += spear13xx_pcie_gadget.o
4893  obj-$(CONFIG_VMWARE_BALLOON)   += vmw_balloon.o
4894  obj-$(CONFIG_ARM_CHARLCD)      += arm-charlcd.o
4895 diff --git a/drivers/misc/hwlat_detector.c b/drivers/misc/hwlat_detector.c
4896 new file mode 100644
4897 index 000000000000..52f5ad5fd9c0
4898 --- /dev/null
4899 +++ b/drivers/misc/hwlat_detector.c
4900 @@ -0,0 +1,1240 @@
4901 +/*
4902 + * hwlat_detector.c - A simple Hardware Latency detector.
4903 + *
4904 + * Use this module to detect large system latencies induced by the behavior of
4905 + * certain underlying system hardware or firmware, independent of Linux itself.
4906 + * The code was developed originally to detect the presence of SMIs on Intel
4907 + * and AMD systems, although there is no dependency upon x86 herein.
4908 + *
4909 + * The classical example usage of this module is in detecting the presence of
4910 + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a
4911 + * somewhat special form of hardware interrupt spawned from earlier CPU debug
4912 + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge
4913 + * LPC (or other device) to generate a special interrupt under certain
4914 + * circumstances, for example, upon expiration of a special SMI timer device,
4915 + * due to certain external thermal readings, on certain I/O address accesses,
4916 + * and other situations. An SMI hits a special CPU pin, triggers a special
4917 + * SMI mode (complete with special memory map), and the OS is unaware.
4918 + *
4922 4919 + * Although certain hardware-induced latencies are necessary (for example,
4920 + * a modern system often requires an SMI handler for correct thermal control
4921 + * and remote management) they can wreak havoc upon any OS-level performance
4922 + * guarantees toward low-latency, especially when the OS is not even made
4923 + * aware of the presence of these interrupts. For this reason, we need a
4924 + * somewhat brute force mechanism to detect these interrupts. In this case,
4925 + * we do it by hogging all of the CPU(s) for configurable timer intervals,
4926 + * sampling the built-in CPU timer, looking for discontiguous readings.
4927 + *
4928 + * WARNING: This implementation necessarily introduces latencies. Therefore,
4929 + *          you should NEVER use this module in a production environment
4930 + *          requiring any kind of low-latency performance guarantee(s).
4931 + *
4932 + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
4933 + *
4934 + * Includes useful feedback from Clark Williams <clark@redhat.com>
4935 + *
4936 + * This file is licensed under the terms of the GNU General Public
4937 + * License version 2. This program is licensed "as is" without any
4938 + * warranty of any kind, whether express or implied.
4939 + */
4940 +
4941 +#include <linux/module.h>
4942 +#include <linux/init.h>
4943 +#include <linux/ring_buffer.h>
4944 +#include <linux/time.h>
4945 +#include <linux/hrtimer.h>
4946 +#include <linux/kthread.h>
4947 +#include <linux/debugfs.h>
4948 +#include <linux/seq_file.h>
4949 +#include <linux/uaccess.h>
4950 +#include <linux/version.h>
4951 +#include <linux/delay.h>
4952 +#include <linux/slab.h>
4953 +#include <linux/trace_clock.h>
4954 +
4955 +#define BUF_SIZE_DEFAULT       262144UL                /* 8K*(sizeof(entry)) */
4956 +#define BUF_FLAGS              (RB_FL_OVERWRITE)       /* no block on full */
4957 +#define U64STR_SIZE            22                      /* 20 digits max */
4958 +
4959 +#define VERSION                        "1.0.0"
4960 +#define BANNER                 "hwlat_detector: "
4961 +#define DRVNAME                        "hwlat_detector"
4962 +#define DEFAULT_SAMPLE_WINDOW  1000000                 /* 1s */
4963 +#define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
4964 +#define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
4965 +
4966 +/* Module metadata */
4967 +
4968 +MODULE_LICENSE("GPL");
4969 +MODULE_AUTHOR("Jon Masters <jcm@redhat.com>");
4970 +MODULE_DESCRIPTION("A simple hardware latency detector");
4971 +MODULE_VERSION(VERSION);
4972 +
4973 +/* Module parameters */
4974 +
4975 +static int debug;
4976 +static int enabled;
4977 +static int threshold;
4978 +
4979 +module_param(debug, int, 0);                   /* enable debug */
4980 +module_param(enabled, int, 0);                 /* enable detector */
4981 +module_param(threshold, int, 0);               /* latency threshold */
4982 +
4983 +/* Buffering and sampling */
4984 +
4985 +static struct ring_buffer *ring_buffer;                /* sample buffer */
4986 +static DEFINE_MUTEX(ring_buffer_mutex);                /* lock changes */
4987 +static unsigned long buf_size = BUF_SIZE_DEFAULT;
4988 +static struct task_struct *kthread;            /* sampling thread */
4989 +
4990 +/* DebugFS filesystem entries */
4991 +
4992 +static struct dentry *debug_dir;               /* debugfs directory */
4993 +static struct dentry *debug_max;               /* maximum TSC delta */
4994 +static struct dentry *debug_count;             /* total detect count */
4995 +static struct dentry *debug_sample_width;      /* sample width us */
4996 +static struct dentry *debug_sample_window;     /* sample window us */
4997 +static struct dentry *debug_sample;            /* raw samples us */
4998 +static struct dentry *debug_threshold;         /* threshold us */
4999 +static struct dentry *debug_enable;            /* enable/disable */
5000 +
5001 +/* Individual samples and global state */
5002 +
5003 +struct sample;                                 /* latency sample */
5004 +struct data;                                   /* Global state */
5005 +
5006 +/* Sampling functions */
5007 +static int __buffer_add_sample(struct sample *sample);
5008 +static struct sample *buffer_get_sample(struct sample *sample);
5009 +
5010 +/* Threading and state */
5011 +static int kthread_fn(void *unused);
5012 +static int start_kthread(void);
5013 +static int stop_kthread(void);
5014 +static void __reset_stats(void);
5015 +static int init_stats(void);
5016 +
5017 +/* Debugfs interface */
5018 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
5019 +                               size_t cnt, loff_t *ppos, const u64 *entry);
5020 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
5021 +                                size_t cnt, loff_t *ppos, u64 *entry);
5022 +static int debug_sample_fopen(struct inode *inode, struct file *filp);
5023 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
5024 +                                 size_t cnt, loff_t *ppos);
5025 +static int debug_sample_release(struct inode *inode, struct file *filp);
5026 +static int debug_enable_fopen(struct inode *inode, struct file *filp);
5027 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
5028 +                                 size_t cnt, loff_t *ppos);
5029 +static ssize_t debug_enable_fwrite(struct file *file,
5030 +                                  const char __user *user_buffer,
5031 +                                  size_t user_size, loff_t *offset);
5032 +
5033 +/* Initialization functions */
5034 +static int init_debugfs(void);
5035 +static void free_debugfs(void);
5036 +static int detector_init(void);
5037 +static void detector_exit(void);
5038 +
5039 +/* Individual latency samples are stored here when detected and packed into
5040 + * the ring_buffer circular buffer, where they are overwritten when
5041 + * more than buf_size/sizeof(sample) samples are received. */
5042 +struct sample {
5043 +       u64             seqnum;         /* unique sequence */
5044 +       u64             duration;       /* ktime delta */
5045 +       u64             outer_duration; /* ktime delta (outer loop) */
5046 +       struct timespec timestamp;      /* wall time */
5047 +       unsigned long   lost;
5048 +};
5049 +
5050 +/* keep the global state somewhere. */
5051 +static struct data {
5052 +
5053 +       struct mutex lock;              /* protect changes */
5054 +
5055 +       u64     count;                  /* total since reset */
5056 +       u64     max_sample;             /* max hardware latency */
5057 +       u64     threshold;              /* sample threshold level */
5058 +
5059 +       u64     sample_window;          /* total sampling window (on+off) */
5060 +       u64     sample_width;           /* active sampling portion of window */
5061 +
5062 +       atomic_t sample_open;           /* whether the sample file is open */
5063 +
5067 5064 +       wait_queue_head_t wq;           /* waitqueue for new sample values */
5065 +
5066 +} data;
5067 +
5068 +/**
5069 + * __buffer_add_sample - add a new latency sample recording to the ring buffer
5070 + * @sample: The new latency sample value
5071 + *
5072 + * This receives a new latency sample and records it in a global ring buffer.
5073 + * No additional locking is used in this case.
5074 + */
5075 +static int __buffer_add_sample(struct sample *sample)
5076 +{
5077 +       return ring_buffer_write(ring_buffer,
5078 +                                sizeof(struct sample), sample);
5079 +}
5080 +
5081 +/**
5082 + * buffer_get_sample - remove a hardware latency sample from the ring buffer
5083 + * @sample: Pre-allocated storage for the sample
5084 + *
5085 + * This retrieves a hardware latency sample from the global circular buffer
5086 + */
5087 +static struct sample *buffer_get_sample(struct sample *sample)
5088 +{
5089 +       struct ring_buffer_event *e = NULL;
5090 +       struct sample *s = NULL;
5091 +       unsigned int cpu = 0;
5092 +
5093 +       if (!sample)
5094 +               return NULL;
5095 +
5096 +       mutex_lock(&ring_buffer_mutex);
5097 +       for_each_online_cpu(cpu) {
5098 +               e = ring_buffer_consume(ring_buffer, cpu, NULL, &sample->lost);
5099 +               if (e)
5100 +                       break;
5101 +       }
5102 +
5103 +       if (e) {
5104 +               s = ring_buffer_event_data(e);
5105 +               memcpy(sample, s, sizeof(struct sample));
5106 +       } else
5107 +               sample = NULL;
5108 +       mutex_unlock(&ring_buffer_mutex);
5109 +
5110 +       return sample;
5111 +}
5112 +
5113 +#ifndef CONFIG_TRACING
5114 +#define time_type      ktime_t
5115 +#define time_get()     ktime_get()
5116 +#define time_to_us(x)  ktime_to_us(x)
5117 +#define time_sub(a, b) ktime_sub(a, b)
5118 +#define init_time(a, b)        (a).tv64 = b
5119 +#define time_u64(a)    ((a).tv64)
5120 +#else
5121 +#define time_type      u64
5122 +#define time_get()     trace_clock_local()
5123 +#define time_to_us(x)  div_u64(x, 1000)
5124 +#define time_sub(a, b) ((a) - (b))
5125 +#define init_time(a, b)        (a = b)
5126 +#define time_u64(a)    a
5127 +#endif
5128 +/**
5129 + * get_sample - sample the CPU TSC and look for likely hardware latencies
5130 + *
5131 + * Used to repeatedly capture the CPU TSC (or similar), looking for potential
5132 + * hardware-induced latency. Called with interrupts disabled and with
5133 + * data.lock held.
5134 + */
5135 +static int get_sample(void)
5136 +{
5137 +       time_type start, t1, t2, last_t2;
5138 +       s64 diff, total = 0;
5139 +       u64 sample = 0;
5140 +       u64 outer_sample = 0;
5141 +       int ret = -1;
5142 +
5143 +       init_time(last_t2, 0);
5144 +       start = time_get(); /* start timestamp */
5145 +
5146 +       do {
5147 +
5148 +               t1 = time_get();        /* we'll look for a discontinuity */
5149 +               t2 = time_get();
5150 +
5151 +               if (time_u64(last_t2)) {
5152 +                       /* Check the delta from outer loop (t2 to next t1) */
5153 +                       diff = time_to_us(time_sub(t1, last_t2));
5154 +                       /* This shouldn't happen */
5155 +                       if (diff < 0) {
5156 +                               pr_err(BANNER "time running backwards\n");
5157 +                               goto out;
5158 +                       }
5159 +                       if (diff > outer_sample)
5160 +                               outer_sample = diff;
5161 +               }
5162 +               last_t2 = t2;
5163 +
5164 +               total = time_to_us(time_sub(t2, start)); /* sample width */
5165 +
5166 +               /* This checks the inner loop (t1 to t2) */
5167 +               diff = time_to_us(time_sub(t2, t1));     /* current diff */
5168 +
5169 +               /* This shouldn't happen */
5170 +               if (diff < 0) {
5171 +                       pr_err(BANNER "time running backwards\n");
5172 +                       goto out;
5173 +               }
5174 +
5175 +               if (diff > sample)
5176 +                       sample = diff; /* only want highest value */
5177 +
5178 +       } while (total <= data.sample_width);
5179 +
5180 +       ret = 0;
5181 +
5182 +       /* If we exceed the threshold value, we have found a hardware latency */
5183 +       if (sample > data.threshold || outer_sample > data.threshold) {
5184 +               struct sample s;
5185 +
5186 +               ret = 1;
5187 +
5188 +               data.count++;
5189 +               s.seqnum = data.count;
5190 +               s.duration = sample;
5191 +               s.outer_duration = outer_sample;
5192 +               s.timestamp = CURRENT_TIME;
5193 +               __buffer_add_sample(&s);
5194 +
5195 +               /* Keep a running maximum ever recorded hardware latency */
5196 +               if (sample > data.max_sample)
5197 +                       data.max_sample = sample;
5198 +       }
5199 +
5200 +out:
5201 +       return ret;
5202 +}
5203 +
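/*
 * A user-space sketch of the same inner/outer gap measurement that
 * get_sample() performs above: take two back-to-back timestamps, record
 * the largest inner gap (t2 - t1) and outer gap (t1 - previous t2) seen
 * during one sample width. Without interrupts disabled this also picks up
 * scheduler and IRQ noise, so treat it as an illustration of the technique
 * rather than a replacement for the module; the 500000 us width simply
 * mirrors DEFAULT_SAMPLE_WIDTH.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_us(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (uint64_t)ts.tv_sec * 1000000ULL + (uint64_t)ts.tv_nsec / 1000;
}

int main(void)
{
        uint64_t start = now_us(), last_t2 = 0;
        uint64_t inner_max = 0, outer_max = 0;
        const uint64_t width_us = 500000;

        do {
                uint64_t t1 = now_us();
                uint64_t t2 = now_us();

                if (last_t2 && t1 - last_t2 > outer_max)
                        outer_max = t1 - last_t2;       /* gap between loop passes */
                if (t2 - t1 > inner_max)
                        inner_max = t2 - t1;            /* gap inside one pass */
                last_t2 = t2;
        } while (now_us() - start <= width_us);

        printf("max inner gap: %llu us, max outer gap: %llu us\n",
               (unsigned long long)inner_max, (unsigned long long)outer_max);
        return 0;
}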
5204 +/*
5205 + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread
5206 + * @unused: A required part of the kthread API.
5207 + *
5208 + * Used to periodically sample the CPU TSC via a call to get_sample. We
5209 + * disable interrupts, which does (intentionally) introduce latency since we
5210 + * need to ensure nothing else might be running (and thus pre-empting).
5211 + * Obviously this should never be used in production environments.
5212 + *
5216 5213 + * Currently this runs on whichever CPU it was scheduled on, but most
5217 5214 + * real-world hardware latency situations occur across several CPUs,
5218 5215 + * and we might later generalize this if we find there are any actual
5219 5216 + * systems with alternate SMI delivery or other hardware latencies.
5217 + */
5218 +static int kthread_fn(void *unused)
5219 +{
5220 +       int ret;
5221 +       u64 interval;
5222 +
5223 +       while (!kthread_should_stop()) {
5224 +
5225 +               mutex_lock(&data.lock);
5226 +
5227 +               local_irq_disable();
5228 +               ret = get_sample();
5229 +               local_irq_enable();
5230 +
5231 +               if (ret > 0)
5232 +                       wake_up(&data.wq); /* wake up reader(s) */
5233 +
5234 +               interval = data.sample_window - data.sample_width;
5235 +               do_div(interval, USEC_PER_MSEC); /* modifies interval value */
5236 +
5237 +               mutex_unlock(&data.lock);
5238 +
5239 +               if (msleep_interruptible(interval))
5240 +                       break;
5241 +       }
5242 +
5243 +       return 0;
5244 +}
5245 +
5246 +/**
5247 + * start_kthread - Kick off the hardware latency sampling/detector kthread
5248 + *
5249 + * This starts a kernel thread that will sit and sample the CPU timestamp
5250 + * counter (TSC or similar) and look for potential hardware latencies.
5251 + */
5252 +static int start_kthread(void)
5253 +{
5254 +       kthread = kthread_run(kthread_fn, NULL,
5255 +                                       DRVNAME);
5256 +       if (IS_ERR(kthread)) {
5257 +               pr_err(BANNER "could not start sampling thread\n");
5258 +               enabled = 0;
5259 +               return -ENOMEM;
5260 +       }
5261 +
5262 +       return 0;
5263 +}
5264 +
5265 +/**
5269 5266 + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
5267 + *
5268 + * This kicks the running hardware latency sampling/detector kernel thread and
5269 + * tells it to stop sampling now. Use this on unload and at system shutdown.
5270 + */
5271 +static int stop_kthread(void)
5272 +{
5273 +       int ret;
5274 +
5275 +       ret = kthread_stop(kthread);
5276 +
5277 +       return ret;
5278 +}
5279 +
5280 +/**
5281 + * __reset_stats - Reset statistics for the hardware latency detector
5282 + *
5283 + * We use data to store various statistics and global state. We call this
5284 + * function in order to reset those when "enable" is toggled on or off, and
5285 + * also at initialization. Should be called with data.lock held.
5286 + */
5287 +static void __reset_stats(void)
5288 +{
5289 +       data.count = 0;
5290 +       data.max_sample = 0;
5291 +       ring_buffer_reset(ring_buffer); /* flush out old sample entries */
5292 +}
5293 +
5294 +/**
5295 + * init_stats - Setup global state statistics for the hardware latency detector
5296 + *
5297 + * We use data to store various statistics and global state. We also use
5298 + * a global ring buffer (ring_buffer) to keep raw samples of detected hardware
5299 + * induced system latencies. This function initializes these structures and
5300 + * allocates the global ring buffer also.
5301 + */
5302 +static int init_stats(void)
5303 +{
5304 +       int ret = -ENOMEM;
5305 +
5306 +       mutex_init(&data.lock);
5307 +       init_waitqueue_head(&data.wq);
5308 +       atomic_set(&data.sample_open, 0);
5309 +
5310 +       ring_buffer = ring_buffer_alloc(buf_size, BUF_FLAGS);
5311 +
5312 +       if (WARN(!ring_buffer, KERN_ERR BANNER
5313 +                              "failed to allocate ring buffer!\n"))
5314 +               goto out;
5315 +
5316 +       __reset_stats();
5317 +       data.threshold = threshold ?: DEFAULT_LAT_THRESHOLD; /* threshold us */
5318 +       data.sample_window = DEFAULT_SAMPLE_WINDOW; /* window us */
5319 +       data.sample_width = DEFAULT_SAMPLE_WIDTH;   /* width us */
5320 +
5321 +       ret = 0;
5322 +
5323 +out:
5324 +       return ret;
5325 +
5326 +}
5327 +
5328 +/*
5329 + * simple_data_read - Wrapper read function for global state debugfs entries
5330 + * @filp: The active open file structure for the debugfs "file"
5331 + * @ubuf: The userspace provided buffer to read value into
5332 + * @cnt: The maximum number of bytes to read
5333 + * @ppos: The current "file" position
5334 + * @entry: The entry to read from
5335 + *
5336 + * This function provides a generic read implementation for the global state
5337 + * "data" structure debugfs filesystem entries. It would be nice to use
5338 + * simple_attr_read directly, but we need to make sure that the data.lock
5339 + * is held during the actual read.
5340 + */
5341 +static ssize_t simple_data_read(struct file *filp, char __user *ubuf,
5342 +                               size_t cnt, loff_t *ppos, const u64 *entry)
5343 +{
5344 +       char buf[U64STR_SIZE];
5345 +       u64 val = 0;
5346 +       int len = 0;
5347 +
5348 +       memset(buf, 0, sizeof(buf));
5349 +
5350 +       if (!entry)
5351 +               return -EFAULT;
5352 +
5353 +       mutex_lock(&data.lock);
5354 +       val = *entry;
5355 +       mutex_unlock(&data.lock);
5356 +
5357 +       len = snprintf(buf, sizeof(buf), "%llu\n", (unsigned long long)val);
5358 +
5359 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
5360 +
5361 +}
5362 +
5363 +/*
5364 + * simple_data_write - Wrapper write function for global state debugfs entries
5365 + * @filp: The active open file structure for the debugfs "file"
5366 + * @ubuf: The userspace provided buffer to write value from
5367 + * @cnt: The maximum number of bytes to write
5368 + * @ppos: The current "file" position
5369 + * @entry: The entry to write to
5370 + *
5371 + * This function provides a generic write implementation for the global state
5372 + * "data" structure debugfs filesystem entries. It would be nice to use
5373 + * simple_attr_write directly, but we need to make sure that the data.lock
5374 + * is held during the actual write.
5375 + */
5376 +static ssize_t simple_data_write(struct file *filp, const char __user *ubuf,
5377 +                                size_t cnt, loff_t *ppos, u64 *entry)
5378 +{
5379 +       char buf[U64STR_SIZE];
5380 +       int csize = min(cnt, sizeof(buf));
5381 +       u64 val = 0;
5382 +       int err = 0;
5383 +
5384 +       memset(buf, '\0', sizeof(buf));
5385 +       if (copy_from_user(buf, ubuf, csize))
5386 +               return -EFAULT;
5387 +
5388 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
5389 +       err = kstrtoull(buf, 10, &val);
5390 +       if (err)
5391 +               return -EINVAL;
5392 +
5393 +       mutex_lock(&data.lock);
5394 +       *entry = val;
5395 +       mutex_unlock(&data.lock);
5396 +
5397 +       return csize;
5398 +}
5399 +
5400 +/**
5401 + * debug_count_fopen - Open function for "count" debugfs entry
5402 + * @inode: The in-kernel inode representation of the debugfs "file"
5403 + * @filp: The active open file structure for the debugfs "file"
5404 + *
5405 + * This function provides an open implementation for the "count" debugfs
5406 + * interface to the hardware latency detector.
5407 + */
5408 +static int debug_count_fopen(struct inode *inode, struct file *filp)
5409 +{
5410 +       return 0;
5411 +}
5412 +
5413 +/**
5414 + * debug_count_fread - Read function for "count" debugfs entry
5415 + * @filp: The active open file structure for the debugfs "file"
5416 + * @ubuf: The userspace provided buffer to read value into
5417 + * @cnt: The maximum number of bytes to read
5418 + * @ppos: The current "file" position
5419 + *
5420 + * This function provides a read implementation for the "count" debugfs
5421 + * interface to the hardware latency detector. Can be used to read the
5422 + * number of latency readings exceeding the configured threshold since
5423 + * the detector was last reset (e.g. by writing a zero into "count").
5424 + */
5425 +static ssize_t debug_count_fread(struct file *filp, char __user *ubuf,
5426 +                                    size_t cnt, loff_t *ppos)
5427 +{
5428 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.count);
5429 +}
5430 +
5431 +/**
5432 + * debug_count_fwrite - Write function for "count" debugfs entry
5433 + * @filp: The active open file structure for the debugfs "file"
5434 + * @ubuf: The user buffer that contains the value to write
5435 + * @cnt: The maximum number of bytes to write to "file"
5436 + * @ppos: The current position in the debugfs "file"
5437 + *
5438 + * This function provides a write implementation for the "count" debugfs
5439 + * interface to the hardware latency detector. Can be used to write a
5440 + * desired value, especially to zero the total count.
5441 + */
5442 +static ssize_t  debug_count_fwrite(struct file *filp,
5443 +                                      const char __user *ubuf,
5444 +                                      size_t cnt,
5445 +                                      loff_t *ppos)
5446 +{
5447 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.count);
5448 +}
5449 +
5450 +/**
5451 + * debug_enable_fopen - Dummy open function for "enable" debugfs interface
5452 + * @inode: The in-kernel inode representation of the debugfs "file"
5453 + * @filp: The active open file structure for the debugfs "file"
5454 + *
5455 + * This function provides an open implementation for the "enable" debugfs
5456 + * interface to the hardware latency detector.
5457 + */
5458 +static int debug_enable_fopen(struct inode *inode, struct file *filp)
5459 +{
5460 +       return 0;
5461 +}
5462 +
5463 +/**
5464 + * debug_enable_fread - Read function for "enable" debugfs interface
5465 + * @filp: The active open file structure for the debugfs "file"
5466 + * @ubuf: The userspace provided buffer to read value into
5467 + * @cnt: The maximum number of bytes to read
5468 + * @ppos: The current "file" position
5469 + *
5470 + * This function provides a read implementation for the "enable" debugfs
5471 + * interface to the hardware latency detector. Can be used to determine
5472 + * whether the detector is currently enabled ("0\n" or "1\n" returned).
5473 + */
5474 +static ssize_t debug_enable_fread(struct file *filp, char __user *ubuf,
5475 +                                     size_t cnt, loff_t *ppos)
5476 +{
5477 +       char buf[4];
5478 +
5479 +       if ((cnt < sizeof(buf)) || (*ppos))
5480 +               return 0;
5481 +
5482 +       buf[0] = enabled ? '1' : '0';
5483 +       buf[1] = '\n';
5484 +       buf[2] = '\0';
5485 +       if (copy_to_user(ubuf, buf, strlen(buf)))
5486 +               return -EFAULT;
5487 +       return *ppos = strlen(buf);
5488 +}
5489 +
5490 +/**
5491 + * debug_enable_fwrite - Write function for "enable" debugfs interface
5492 + * @filp: The active open file structure for the debugfs "file"
5493 + * @ubuf: The user buffer that contains the value to write
5494 + * @cnt: The maximum number of bytes to write to "file"
5495 + * @ppos: The current position in the debugfs "file"
5496 + *
5497 + * This function provides a write implementation for the "enable" debugfs
5498 + * interface to the hardware latency detector. Can be used to enable or
5499 + * disable the detector. When the state actually changes, enabling
5500 + * resets the global stats and starts the measuring kthread, while
5501 + * disabling stops it.
5502 + */
5503 +static ssize_t  debug_enable_fwrite(struct file *filp,
5504 +                                       const char __user *ubuf,
5505 +                                       size_t cnt,
5506 +                                       loff_t *ppos)
5507 +{
5508 +       char buf[4];
5509 +       int csize = min(cnt, sizeof(buf));
5510 +       long val = 0;
5511 +       int err = 0;
5512 +
5513 +       memset(buf, '\0', sizeof(buf));
5514 +       if (copy_from_user(buf, ubuf, csize))
5515 +               return -EFAULT;
5516 +
5517 +       buf[sizeof(buf)-1] = '\0';                      /* just in case */
5518 +       err = kstrtoul(buf, 10, &val);
5519 +       if (err)
5520 +               return -EINVAL;
5521 +
5522 +       if (val) {
5523 +               if (enabled)
5524 +                       goto unlock;
5525 +               enabled = 1;
5526 +               __reset_stats();
5527 +               if (start_kthread())
5528 +                       return -EFAULT;
5529 +       } else {
5530 +               if (!enabled)
5531 +                       goto unlock;
5532 +               enabled = 0;
5533 +               err = stop_kthread();
5534 +               if (err) {
5535 +                       pr_err(BANNER "cannot stop kthread\n");
5536 +                       return -EFAULT;
5537 +               }
5538 +               wake_up(&data.wq);              /* reader(s) should return */
5539 +       }
5540 +unlock:
5541 +       return csize;
5542 +}
5543 +
5544 +/**
5545 + * debug_max_fopen - Open function for "max" debugfs entry
5546 + * @inode: The in-kernel inode representation of the debugfs "file"
5547 + * @filp: The active open file structure for the debugfs "file"
5548 + *
5549 + * This function provides an open implementation for the "max" debugfs
5550 + * interface to the hardware latency detector.
5551 + */
5552 +static int debug_max_fopen(struct inode *inode, struct file *filp)
5553 +{
5554 +       return 0;
5555 +}
5556 +
5557 +/**
5558 + * debug_max_fread - Read function for "max" debugfs entry
5559 + * @filp: The active open file structure for the debugfs "file"
5560 + * @ubuf: The userspace provided buffer to read value into
5561 + * @cnt: The maximum number of bytes to read
5562 + * @ppos: The current "file" position
5563 + *
5564 + * This function provides a read implementation for the "max" debugfs
5565 + * interface to the hardware latency detector. Can be used to determine
5566 + * the maximum latency value observed since it was last reset.
5567 + */
5568 +static ssize_t debug_max_fread(struct file *filp, char __user *ubuf,
5569 +                                  size_t cnt, loff_t *ppos)
5570 +{
5571 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.max_sample);
5572 +}
5573 +
5574 +/**
5575 + * debug_max_fwrite - Write function for "max" debugfs entry
5576 + * @filp: The active open file structure for the debugfs "file"
5577 + * @ubuf: The user buffer that contains the value to write
5578 + * @cnt: The maximum number of bytes to write to "file"
5579 + * @ppos: The current position in the debugfs "file"
5580 + *
5581 + * This function provides a write implementation for the "max" debugfs
5582 + * interface to the hardware latency detector. Can be used to reset the
5583 + * maximum or set it to some other desired value - if, then, subsequent
5584 + * measurements exceed this value, the maximum will be updated.
5585 + */
5586 +static ssize_t  debug_max_fwrite(struct file *filp,
5587 +                                    const char __user *ubuf,
5588 +                                    size_t cnt,
5589 +                                    loff_t *ppos)
5590 +{
5591 +       return simple_data_write(filp, ubuf, cnt, ppos, &data.max_sample);
5592 +}
5593 +
5594 +
5595 +/**
5596 + * debug_sample_fopen - An open function for "sample" debugfs interface
5597 + * @inode: The in-kernel inode representation of this debugfs "file"
5598 + * @filp: The active open file structure for the debugfs "file"
5599 + *
5600 + * This function handles opening the "sample" file within the hardware
5601 + * latency detector debugfs directory interface. This file is used to read
5602 + * raw samples from the global ring_buffer and allows the user to see a
5603 + * running latency history. Can be opened blocking or non-blocking,
5604 + * which determines whether reads block for new samples or return at once.
5605 + * Implements simple locking to prevent multiple simultaneous use.
5606 + */
5607 +static int debug_sample_fopen(struct inode *inode, struct file *filp)
5608 +{
5609 +       if (!atomic_add_unless(&data.sample_open, 1, 1))
5610 +               return -EBUSY;
5611 +       else
5612 +               return 0;
5613 +}
5614 +
5615 +/**
5616 + * debug_sample_fread - A read function for "sample" debugfs interface
5617 + * @filp: The active open file structure for the debugfs "file"
5618 + * @ubuf: The user buffer that will contain the samples read
5619 + * @cnt: The maximum bytes to read from the debugfs "file"
5620 + * @ppos: The current position in the debugfs "file"
5621 + *
5622 + * This function handles reading from the "sample" file within the hardware
5623 + * latency detector debugfs directory interface. This file is used to read
5624 + * raw samples from the global ring_buffer and allows the user to see a
5625 + * running latency history. By default this will block pending a new
5626 + * value written into the sample buffer, unless there are already a
5627 + * number of value(s) waiting in the buffer, or the sample file was
5628 + * previously opened in a non-blocking mode of operation.
5629 + */
5630 +static ssize_t debug_sample_fread(struct file *filp, char __user *ubuf,
5631 +                                       size_t cnt, loff_t *ppos)
5632 +{
5633 +       int len = 0;
5634 +       char buf[64];
5635 +       struct sample *sample = NULL;
5636 +
5637 +       if (!enabled)
5638 +               return 0;
5639 +
5640 +       sample = kzalloc(sizeof(struct sample), GFP_KERNEL);
5641 +       if (!sample)
5642 +               return -ENOMEM;
5643 +
5644 +       while (!buffer_get_sample(sample)) {
5645 +
5646 +               DEFINE_WAIT(wait);
5647 +
5648 +               if (filp->f_flags & O_NONBLOCK) {
5649 +                       len = -EAGAIN;
5650 +                       goto out;
5651 +               }
5652 +
5653 +               prepare_to_wait(&data.wq, &wait, TASK_INTERRUPTIBLE);
5654 +               schedule();
5655 +               finish_wait(&data.wq, &wait);
5656 +
5657 +               if (signal_pending(current)) {
5658 +                       len = -EINTR;
5659 +                       goto out;
5660 +               }
5661 +
5662 +               if (!enabled) {                 /* enable was toggled */
5663 +                       len = 0;
5664 +                       goto out;
5665 +               }
5666 +       }
5667 +
5668 +       len = snprintf(buf, sizeof(buf), "%010lu.%010lu\t%llu\t%llu\n",
5669 +                      sample->timestamp.tv_sec,
5670 +                      sample->timestamp.tv_nsec,
5671 +                      sample->duration,
5672 +                      sample->outer_duration);
5673 +
5674 +
5675 +       /* handling partial reads is more trouble than it's worth */
5676 +       if (len > cnt)
5677 +               goto out;
5678 +
5679 +       if (copy_to_user(ubuf, buf, len))
5680 +               len = -EFAULT;
5681 +
5682 +out:
5683 +       kfree(sample);
5684 +       return len;
5685 +}
5686 +
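
The loop above is what gives the "sample" file its pipe-like behaviour: a blocking reader sleeps on data.wq until the sampling kthread queues a record, while an O_NONBLOCK reader gets -EAGAIN when the ring buffer is empty. A minimal userspace sketch of the non-blocking case, with the debugfs path again assumed as before:

/* Hypothetical non-blocking reader; the path is an assumption. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char line[128];
	ssize_t n;
	int fd = open("/sys/kernel/debug/hwlat_detector/sample",
		      O_RDONLY | O_NONBLOCK);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = read(fd, line, sizeof(line) - 1);
	if (n > 0) {
		line[n] = '\0';
		fputs(line, stdout);	/* <sec>.<nsec>\t<inner us>\t<outer us> */
	} else if (n < 0 && errno == EAGAIN) {
		puts("no sample pending");
	}
	close(fd);
	return 0;
}

Note that debug_sample_fopen() permits only one opener at a time, so a second concurrent reader gets -EBUSY.
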
5687 +/**
5688 + * debug_sample_release - Release function for "sample" debugfs interface
5689 + * @inode: The in-kernel inode representation of the debugfs "file"
5690 + * @filp: The active open file structure for the debugfs "file"
5691 + *
5692 + * This function completes the close of the debugfs interface "sample" file.
5693 + * Frees the sample_open "lock" so that other users may open the interface.
5694 + */
5695 +static int debug_sample_release(struct inode *inode, struct file *filp)
5696 +{
5697 +       atomic_dec(&data.sample_open);
5698 +
5699 +       return 0;
5700 +}
5701 +
5702 +/**
5703 + * debug_threshold_fopen - Open function for "threshold" debugfs entry
5704 + * @inode: The in-kernel inode representation of the debugfs "file"
5705 + * @filp: The active open file structure for the debugfs "file"
5706 + *
5707 + * This function provides an open implementation for the "threshold" debugfs
5708 + * interface to the hardware latency detector.
5709 + */
5710 +static int debug_threshold_fopen(struct inode *inode, struct file *filp)
5711 +{
5712 +       return 0;
5713 +}
5714 +
5715 +/**
5716 + * debug_threshold_fread - Read function for "threshold" debugfs entry
5717 + * @filp: The active open file structure for the debugfs "file"
5718 + * @ubuf: The userspace provided buffer to read value into
5719 + * @cnt: The maximum number of bytes to read
5720 + * @ppos: The current "file" position
5721 + *
5722 + * This function provides a read implementation for the "threshold" debugfs
5723 + * interface to the hardware latency detector. It can be used to determine
5724 + * the current threshold level at which a latency will be recorded in the
5725 + * global ring buffer, typically on the order of 10us.
5726 + */
5727 +static ssize_t debug_threshold_fread(struct file *filp, char __user *ubuf,
5728 +                                        size_t cnt, loff_t *ppos)
5729 +{
5730 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.threshold);
5731 +}
5732 +
5733 +/**
5734 + * debug_threshold_fwrite - Write function for "threshold" debugfs entry
5735 + * @filp: The active open file structure for the debugfs "file"
5736 + * @ubuf: The user buffer that contains the value to write
5737 + * @cnt: The maximum number of bytes to write to "file"
5738 + * @ppos: The current position in the debugfs "file"
5739 + *
5740 + * This function provides a write implementation for the "threshold" debugfs
5741 + * interface to the hardware latency detector. It can be used to configure
5742 + * the threshold level at which any subsequently detected latencies will
5743 + * be recorded into the global ring buffer.
5744 + */
5745 +static ssize_t  debug_threshold_fwrite(struct file *filp,
5746 +                                       const char __user *ubuf,
5747 +                                       size_t cnt,
5748 +                                       loff_t *ppos)
5749 +{
5750 +       int ret;
5751 +
5752 +       ret = simple_data_write(filp, ubuf, cnt, ppos, &data.threshold);
5753 +
5754 +       if (enabled)
5755 +               wake_up_process(kthread);
5756 +
5757 +       return ret;
5758 +}
5759 +
5760 +/**
5761 + * debug_width_fopen - Open function for "width" debugfs entry
5762 + * @inode: The in-kernel inode representation of the debugfs "file"
5763 + * @filp: The active open file structure for the debugfs "file"
5764 + *
5765 + * This function provides an open implementation for the "width" debugfs
5766 + * interface to the hardware latency detector.
5767 + */
5768 +static int debug_width_fopen(struct inode *inode, struct file *filp)
5769 +{
5770 +       return 0;
5771 +}
5772 +
5773 +/**
5774 + * debug_width_fread - Read function for "width" debugfs entry
5775 + * @filp: The active open file structure for the debugfs "file"
5776 + * @ubuf: The userspace provided buffer to read value into
5777 + * @cnt: The maximum number of bytes to read
5778 + * @ppos: The current "file" position
5779 + *
5780 + * This function provides a read implementation for the "width" debugfs
5781 + * interface to the hardware latency detector. It can be used to determine
5782 + * for how many us of the total window us we will actively sample for any
5783 + * hardware-induced latecy periods. Obviously, it is not possible to
5784 + * sample constantly and have the system respond to a sample reader, or,
5785 + * worse, without having the system appear to have gone out to lunch.
5786 + */
5787 +static ssize_t debug_width_fread(struct file *filp, char __user *ubuf,
5788 +                                    size_t cnt, loff_t *ppos)
5789 +{
5790 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_width);
5791 +}
5792 +
5793 +/**
5794 + * debug_width_fwrite - Write function for "width" debugfs entry
5795 + * @filp: The active open file structure for the debugfs "file"
5796 + * @ubuf: The user buffer that contains the value to write
5797 + * @cnt: The maximum number of bytes to write to "file"
5798 + * @ppos: The current position in the debugfs "file"
5799 + *
5800 + * This function provides a write implementation for the "width" debugfs
5801 + * interface to the hardware latency detector. It can be used to configure
5802 + * for how many us of the total window us we will actively sample for any
5803 + * hardware-induced latency periods. Obviously, it is not possible to
5804 + * sample constantly and have the system respond to a sample reader, or,
5805 + * worse, without having the system appear to have gone out to lunch. It
5806 + * is enforced that width is less than the total window size.
5807 + */
5808 +static ssize_t  debug_width_fwrite(struct file *filp,
5809 +                                      const char __user *ubuf,
5810 +                                      size_t cnt,
5811 +                                      loff_t *ppos)
5812 +{
5813 +       char buf[U64STR_SIZE];
5814 +       int csize = min(cnt, sizeof(buf));
5815 +       u64 val = 0;
5816 +       int err = 0;
5817 +
5818 +       memset(buf, '\0', sizeof(buf));
5819 +       if (copy_from_user(buf, ubuf, csize))
5820 +               return -EFAULT;
5821 +
5822 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
5823 +       err = kstrtoull(buf, 10, &val);
5824 +       if (err)
5825 +               return -EINVAL;
5826 +
5827 +       mutex_lock(&data.lock);
5828 +       if (val < data.sample_window)
5829 +               data.sample_width = val;
5830 +       else {
5831 +               mutex_unlock(&data.lock);
5832 +               return -EINVAL;
5833 +       }
5834 +       mutex_unlock(&data.lock);
5835 +
5836 +       if (enabled)
5837 +               wake_up_process(kthread);
5838 +
5839 +       return csize;
5840 +}
5841 +
5842 +/**
5843 + * debug_window_fopen - Open function for "window" debugfs entry
5844 + * @inode: The in-kernel inode representation of the debugfs "file"
5845 + * @filp: The active open file structure for the debugfs "file"
5846 + *
5847 + * This function provides an open implementation for the "window" debugfs
5848 + * interface to the hardware latency detector. The window is the total time
5849 + * in us that will be considered one sample period. Conceptually, windows
5850 + * occur back-to-back and contain a sample width period during which
5851 + * actual sampling occurs.
5852 + */
5853 +static int debug_window_fopen(struct inode *inode, struct file *filp)
5854 +{
5855 +       return 0;
5856 +}
5857 +
5858 +/**
5859 + * debug_window_fread - Read function for "window" debugfs entry
5860 + * @filp: The active open file structure for the debugfs "file"
5861 + * @ubuf: The userspace provided buffer to read value into
5862 + * @cnt: The maximum number of bytes to read
5863 + * @ppos: The current "file" position
5864 + *
5865 + * This function provides a read implementation for the "window" debugfs
5866 + * interface to the hardware latency detector. The window is the total time
5867 + * in us that will be considered one sample period. Conceptually, windows
5868 + * occur back-to-back and contain a sample width period during which
5869 + * actual sampling occurs. Can be used to read the total window size.
5870 + */
5871 +static ssize_t debug_window_fread(struct file *filp, char __user *ubuf,
5872 +                                     size_t cnt, loff_t *ppos)
5873 +{
5874 +       return simple_data_read(filp, ubuf, cnt, ppos, &data.sample_window);
5875 +}
5876 +
5877 +/**
5878 + * debug_window_fwrite - Write function for "window" debugfs entry
5879 + * @filp: The active open file structure for the debugfs "file"
5880 + * @ubuf: The user buffer that contains the value to write
5881 + * @cnt: The maximum number of bytes to write to "file"
5882 + * @ppos: The current position in the debugfs "file"
5883 + *
5884 + * This function provides a write implementation for the "window" debufds
5885 + * interface to the hardware latency detetector. The window is the total time
5886 + * in us that will be considered one sample period. Conceptually, windows
5887 + * occur back-to-back and contain a sample width period during which
5888 + * actual sampling occurs. Can be used to write a new total window size. It
5889 + * is enforced that any value written must be greater than the sample width
5890 + * size, or an error results.
5891 + */
5892 +static ssize_t  debug_window_fwrite(struct file *filp,
5893 +                                       const char __user *ubuf,
5894 +                                       size_t cnt,
5895 +                                       loff_t *ppos)
5896 +{
5897 +       char buf[U64STR_SIZE];
5898 +       int csize = min(cnt, sizeof(buf));
5899 +       u64 val = 0;
5900 +       int err = 0;
5901 +
5902 +       memset(buf, '\0', sizeof(buf));
5903 +       if (copy_from_user(buf, ubuf, csize))
5904 +               return -EFAULT;
5905 +
5906 +       buf[U64STR_SIZE-1] = '\0';                      /* just in case */
5907 +       err = kstrtoull(buf, 10, &val);
5908 +       if (err)
5909 +               return -EINVAL;
5910 +
5911 +       mutex_lock(&data.lock);
5912 +       if (data.sample_width < val)
5913 +               data.sample_window = val;
5914 +       else {
5915 +               mutex_unlock(&data.lock);
5916 +               return -EINVAL;
5917 +       }
5918 +       mutex_unlock(&data.lock);
5919 +
5920 +       return csize;
5921 +}
5922 +
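
Because both debug_width_fwrite() and debug_window_fwrite() reject any value that would not keep the width strictly below the window, the order of writes matters when reconfiguring both: grow the window before the width, and shrink the width before the window. A hedged userspace sketch of that ordering, with the debugfs path once more assumed:

/* Hypothetical reconfiguration helper; paths are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int put_val(const char *path, unsigned long long val)
{
	char buf[32];
	int len = snprintf(buf, sizeof(buf), "%llu\n", val);
	int fd = open(path, O_WRONLY);
	int ok = fd >= 0 && write(fd, buf, len) == len;

	if (fd >= 0)
		close(fd);
	return ok ? 0 : -1;
}

int main(void)
{
	/*
	 * Target: a 2 s window with 1 s of sampling.  The window (the
	 * larger value) is written first so that width < window holds
	 * at every intermediate step; the reverse order can be rejected
	 * with -EINVAL by the checks above.
	 */
	if (put_val("/sys/kernel/debug/hwlat_detector/window", 2000000) ||
	    put_val("/sys/kernel/debug/hwlat_detector/width", 1000000)) {
		fprintf(stderr, "reconfiguration rejected\n");
		return 1;
	}
	return 0;
}
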
5923 +/*
5924 + * Function pointers for the "count" debugfs file operations
5925 + */
5926 +static const struct file_operations count_fops = {
5927 +       .open           = debug_count_fopen,
5928 +       .read           = debug_count_fread,
5929 +       .write          = debug_count_fwrite,
5930 +       .owner          = THIS_MODULE,
5931 +};
5932 +
5933 +/*
5934 + * Function pointers for the "enable" debugfs file operations
5935 + */
5936 +static const struct file_operations enable_fops = {
5937 +       .open           = debug_enable_fopen,
5938 +       .read           = debug_enable_fread,
5939 +       .write          = debug_enable_fwrite,
5940 +       .owner          = THIS_MODULE,
5941 +};
5942 +
5943 +/*
5944 + * Function pointers for the "max" debugfs file operations
5945 + */
5946 +static const struct file_operations max_fops = {
5947 +       .open           = debug_max_fopen,
5948 +       .read           = debug_max_fread,
5949 +       .write          = debug_max_fwrite,
5950 +       .owner          = THIS_MODULE,
5951 +};
5952 +
5953 +/*
5954 + * Function pointers for the "sample" debugfs file operations
5955 + */
5956 +static const struct file_operations sample_fops = {
5957 +       .open           = debug_sample_fopen,
5958 +       .read           = debug_sample_fread,
5959 +       .release        = debug_sample_release,
5960 +       .owner          = THIS_MODULE,
5961 +};
5962 +
5963 +/*
5964 + * Function pointers for the "threshold" debugfs file operations
5965 + */
5966 +static const struct file_operations threshold_fops = {
5967 +       .open           = debug_threshold_fopen,
5968 +       .read           = debug_threshold_fread,
5969 +       .write          = debug_threshold_fwrite,
5970 +       .owner          = THIS_MODULE,
5971 +};
5972 +
5973 +/*
5974 + * Function pointers for the "width" debugfs file operations
5975 + */
5976 +static const struct file_operations width_fops = {
5977 +       .open           = debug_width_fopen,
5978 +       .read           = debug_width_fread,
5979 +       .write          = debug_width_fwrite,
5980 +       .owner          = THIS_MODULE,
5981 +};
5982 +
5983 +/*
5984 + * Function pointers for the "window" debugfs file operations
5985 + */
5986 +static const struct file_operations window_fops = {
5987 +       .open           = debug_window_fopen,
5988 +       .read           = debug_window_fread,
5989 +       .write          = debug_window_fwrite,
5990 +       .owner          = THIS_MODULE,
5991 +};
5992 +
5993 +/**
5994 + * init_debugfs - A function to initialize the debugfs interface files
5995 + *
5996 + * This function creates entries in debugfs for "hwlat_detector", including
5997 + * files to read values from the detector, current samples, and the
5998 + * maximum sample that has been captured since the hardware latency
5999 + * detector was started.
6000 + */
6001 +static int init_debugfs(void)
6002 +{
6003 +       int ret = -ENOMEM;
6004 +
6005 +       debug_dir = debugfs_create_dir(DRVNAME, NULL);
6006 +       if (!debug_dir)
6007 +               goto err_debug_dir;
6008 +
6009 +       debug_sample = debugfs_create_file("sample", 0444,
6010 +                                              debug_dir, NULL,
6011 +                                              &sample_fops);
6012 +       if (!debug_sample)
6013 +               goto err_sample;
6014 +
6015 +       debug_count = debugfs_create_file("count", 0444,
6016 +                                             debug_dir, NULL,
6017 +                                             &count_fops);
6018 +       if (!debug_count)
6019 +               goto err_count;
6020 +
6021 +       debug_max = debugfs_create_file("max", 0444,
6022 +                                           debug_dir, NULL,
6023 +                                           &max_fops);
6024 +       if (!debug_max)
6025 +               goto err_max;
6026 +
6027 +       debug_sample_window = debugfs_create_file("window", 0644,
6028 +                                                     debug_dir, NULL,
6029 +                                                     &window_fops);
6030 +       if (!debug_sample_window)
6031 +               goto err_window;
6032 +
6033 +       debug_sample_width = debugfs_create_file("width", 0644,
6034 +                                                    debug_dir, NULL,
6035 +                                                    &width_fops);
6036 +       if (!debug_sample_width)
6037 +               goto err_width;
6038 +
6039 +       debug_threshold = debugfs_create_file("threshold", 0644,
6040 +                                                 debug_dir, NULL,
6041 +                                                 &threshold_fops);
6042 +       if (!debug_threshold)
6043 +               goto err_threshold;
6044 +
6045 +       debug_enable = debugfs_create_file("enable", 0644,
6046 +                                              debug_dir, &enabled,
6047 +                                              &enable_fops);
6048 +       if (!debug_enable)
6049 +               goto err_enable;
6050 +
6051 +       else {
6052 +               ret = 0;
6053 +               goto out;
6054 +       }
6055 +
6056 +err_enable:
6057 +       debugfs_remove(debug_threshold);
6058 +err_threshold:
6059 +       debugfs_remove(debug_sample_width);
6060 +err_width:
6061 +       debugfs_remove(debug_sample_window);
6062 +err_window:
6063 +       debugfs_remove(debug_max);
6064 +err_max:
6065 +       debugfs_remove(debug_count);
6066 +err_count:
6067 +       debugfs_remove(debug_sample);
6068 +err_sample:
6069 +       debugfs_remove(debug_dir);
6070 +err_debug_dir:
6071 +out:
6072 +       return ret;
6073 +}
6074 +
6075 +/**
6076 + * free_debugfs - A function to cleanup the debugfs file interface
6077 + */
6078 +static void free_debugfs(void)
6079 +{
6080 +       /* could also use a debugfs_remove_recursive */
6081 +       debugfs_remove(debug_enable);
6082 +       debugfs_remove(debug_threshold);
6083 +       debugfs_remove(debug_sample_width);
6084 +       debugfs_remove(debug_sample_window);
6085 +       debugfs_remove(debug_max);
6086 +       debugfs_remove(debug_count);
6087 +       debugfs_remove(debug_sample);
6088 +       debugfs_remove(debug_dir);
6089 +}
6090 +
6091 +/**
6092 + * detector_init - Standard module initialization code
6093 + */
6094 +static int detector_init(void)
6095 +{
6096 +       int ret = -ENOMEM;
6097 +
6098 +       pr_info(BANNER "version %s\n", VERSION);
6099 +
6100 +       ret = init_stats();
6101 +       if (ret)
6102 +               goto out;
6103 +
6104 +       ret = init_debugfs();
6105 +       if (ret)
6106 +               goto err_stats;
6107 +
6108 +       if (enabled)
6109 +               ret = start_kthread();
6110 +
6111 +       goto out;
6112 +
6113 +err_stats:
6114 +       ring_buffer_free(ring_buffer);
6115 +out:
6116 +       return ret;
6117 +
6118 +}
6119 +
6120 +/**
6121 + * detector_exit - Standard module cleanup code
6122 + */
6123 +static void detector_exit(void)
6124 +{
6125 +       int err;
6126 +
6127 +       if (enabled) {
6128 +               enabled = 0;
6129 +               err = stop_kthread();
6130 +               if (err)
6131 +                       pr_err(BANNER "cannot stop kthread\n");
6132 +       }
6133 +
6134 +       free_debugfs();
6135 +       ring_buffer_free(ring_buffer);  /* free up the ring buffer */
6136 +
6137 +}
6138 +
6139 +module_init(detector_init);
6140 +module_exit(detector_exit);
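
The record format produced by debug_sample_fread() above is fixed by its snprintf(): the timestamp's seconds and nanoseconds, then the inner and outer durations in microseconds, tab-separated. A small parsing sketch (the sample values below are made up):

/* Hypothetical parser for one sample record; the values are invented. */
#include <stdio.h>

int main(void)
{
	const char *record = "0000012345.0000067890\t27\t3\n";
	unsigned long sec, nsec;
	unsigned long long inner_us, outer_us;

	if (sscanf(record, "%lu.%lu\t%llu\t%llu",
		   &sec, &nsec, &inner_us, &outer_us) == 4)
		printf("at %lu.%09lu: inner %llu us, outer %llu us\n",
		       sec, nsec, inner_us, outer_us);
	return 0;
}
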
6141 diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
6142 index df990bb8c873..1a162709a85e 100644
6143 --- a/drivers/mmc/host/mmci.c
6144 +++ b/drivers/mmc/host/mmci.c
6145 @@ -1147,15 +1147,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
6146         struct sg_mapping_iter *sg_miter = &host->sg_miter;
6147         struct variant_data *variant = host->variant;
6148         void __iomem *base = host->base;
6149 -       unsigned long flags;
6150         u32 status;
6151  
6152         status = readl(base + MMCISTATUS);
6153  
6154         dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status);
6155  
6156 -       local_irq_save(flags);
6157 -
6158         do {
6159                 unsigned int remain, len;
6160                 char *buffer;
6161 @@ -1195,8 +1192,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id)
6162  
6163         sg_miter_stop(sg_miter);
6164  
6165 -       local_irq_restore(flags);
6166 -
6167         /*
6168          * If we have less than the fifo 'half-full' threshold to transfer,
6169          * trigger a PIO interrupt as soon as any data is available.
6170 diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
6171 index 25c55ab05c7d..5a1d117a8744 100644
6172 --- a/drivers/net/ethernet/3com/3c59x.c
6173 +++ b/drivers/net/ethernet/3com/3c59x.c
6174 @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev)
6175  {
6176         struct vortex_private *vp = netdev_priv(dev);
6177         unsigned long flags;
6178 -       local_irq_save(flags);
6179 +       local_irq_save_nort(flags);
6180         (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev);
6181 -       local_irq_restore(flags);
6182 +       local_irq_restore_nort(flags);
6183  }
6184  #endif
6185  
6186 @@ -1910,12 +1910,12 @@ static void vortex_tx_timeout(struct net_device *dev)
6187                          * Block interrupts because vortex_interrupt does a bare spin_lock()
6188                          */
6189                         unsigned long flags;
6190 -                       local_irq_save(flags);
6191 +                       local_irq_save_nort(flags);
6192                         if (vp->full_bus_master_tx)
6193                                 boomerang_interrupt(dev->irq, dev);
6194                         else
6195                                 vortex_interrupt(dev->irq, dev);
6196 -                       local_irq_restore(flags);
6197 +                       local_irq_restore_nort(flags);
6198                 }
6199         }
6200  
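
The local_irq_save_nort()/local_irq_restore_nort() helpers used in this and several of the following hunks are defined elsewhere in this patch. The sketch below illustrates the idea only and is not the patch's exact definition: on a non-RT kernel they behave like plain local_irq_save()/local_irq_restore(), while on PREEMPT_RT_FULL they merely record the flags and leave interrupts enabled, because the code they bracket may take sleeping locks:

/* Sketch only -- the real definitions are provided elsewhere in this patch. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)	(void)(flags)
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
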
6201 diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
6202 index da4c2d8a4173..1420dfb56bac 100644
6203 --- a/drivers/net/ethernet/realtek/8139too.c
6204 +++ b/drivers/net/ethernet/realtek/8139too.c
6205 @@ -2233,7 +2233,7 @@ static void rtl8139_poll_controller(struct net_device *dev)
6206         struct rtl8139_private *tp = netdev_priv(dev);
6207         const int irq = tp->pci_dev->irq;
6208  
6209 -       disable_irq(irq);
6210 +       disable_irq_nosync(irq);
6211         rtl8139_interrupt(irq, dev);
6212         enable_irq(irq);
6213  }
6214 diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
6215 index 56f109bc8394..02afc796bc71 100644
6216 --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
6217 +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
6218 @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv,
6219                         while (!ctx->done.done && msecs--)
6220                                 udelay(1000);
6221                 } else {
6222 -                       wait_event_interruptible(ctx->done.wait,
6223 +                       swait_event_interruptible(ctx->done.wait,
6224                                                  ctx->done.done);
6225                 }
6226                 break;
6227 diff --git a/drivers/pci/access.c b/drivers/pci/access.c
6228 index d11cdbb8fba3..223bbb9acb03 100644
6229 --- a/drivers/pci/access.c
6230 +++ b/drivers/pci/access.c
6231 @@ -672,7 +672,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
6232         WARN_ON(!dev->block_cfg_access);
6233  
6234         dev->block_cfg_access = 0;
6235 -       wake_up_all(&pci_cfg_wait);
6236 +       wake_up_all_locked(&pci_cfg_wait);
6237         raw_spin_unlock_irqrestore(&pci_lock, flags);
6238  }
6239  EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
6240 diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
6241 index 9bd41a35a78a..8e2d436c2e3f 100644
6242 --- a/drivers/scsi/fcoe/fcoe.c
6243 +++ b/drivers/scsi/fcoe/fcoe.c
6244 @@ -1455,11 +1455,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev,
6245  static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen)
6246  {
6247         struct fcoe_percpu_s *fps;
6248 -       int rc;
6249 +       int rc, cpu = get_cpu_light();
6250  
6251 -       fps = &get_cpu_var(fcoe_percpu);
6252 +       fps = &per_cpu(fcoe_percpu, cpu);
6253         rc = fcoe_get_paged_crc_eof(skb, tlen, fps);
6254 -       put_cpu_var(fcoe_percpu);
6255 +       put_cpu_light();
6256  
6257         return rc;
6258  }
6259 @@ -1646,11 +1646,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport,
6260                 return 0;
6261         }
6262  
6263 -       stats = per_cpu_ptr(lport->stats, get_cpu());
6264 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
6265         stats->InvalidCRCCount++;
6266         if (stats->InvalidCRCCount < 5)
6267                 printk(KERN_WARNING "fcoe: dropping frame with CRC error\n");
6268 -       put_cpu();
6269 +       put_cpu_light();
6270         return -EINVAL;
6271  }
6272  
6273 @@ -1693,7 +1693,7 @@ static void fcoe_recv_frame(struct sk_buff *skb)
6274          */
6275         hp = (struct fcoe_hdr *) skb_network_header(skb);
6276  
6277 -       stats = per_cpu_ptr(lport->stats, get_cpu());
6278 +       stats = per_cpu_ptr(lport->stats, get_cpu_light());
6279         if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) {
6280                 if (stats->ErrorFrames < 5)
6281                         printk(KERN_WARNING "fcoe: FCoE version "
6282 @@ -1725,13 +1725,13 @@ static void fcoe_recv_frame(struct sk_buff *skb)
6283                 goto drop;
6284  
6285         if (!fcoe_filter_frames(lport, fp)) {
6286 -               put_cpu();
6287 +               put_cpu_light();
6288                 fc_exch_recv(lport, fp);
6289                 return;
6290         }
6291  drop:
6292         stats->ErrorFrames++;
6293 -       put_cpu();
6294 +       put_cpu_light();
6295         kfree_skb(skb);
6296  }
6297  
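
get_cpu_light()/put_cpu_light(), used throughout the FCoE and libfc changes, are likewise RT helpers defined elsewhere in this patch. The sketch below shows the intent rather than the exact definition: instead of disabling preemption the way get_cpu() does, the light variants are expected to disable only migration, which keeps per-CPU data stable while still allowing the section to be preempted and to sleep on RT:

/* Sketch only -- illustrates the intent, not the exact RT definitions. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()	({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()	migrate_enable()
#else
# define get_cpu_light()	get_cpu()
# define put_cpu_light()	put_cpu()
#endif
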
6298 diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
6299 index dcf36537a767..1a1f2e46452c 100644
6300 --- a/drivers/scsi/fcoe/fcoe_ctlr.c
6301 +++ b/drivers/scsi/fcoe/fcoe_ctlr.c
6302 @@ -834,7 +834,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
6303  
6304         INIT_LIST_HEAD(&del_list);
6305  
6306 -       stats = per_cpu_ptr(fip->lp->stats, get_cpu());
6307 +       stats = per_cpu_ptr(fip->lp->stats, get_cpu_light());
6308  
6309         list_for_each_entry_safe(fcf, next, &fip->fcfs, list) {
6310                 deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2;
6311 @@ -870,7 +870,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip)
6312                                 sel_time = fcf->time;
6313                 }
6314         }
6315 -       put_cpu();
6316 +       put_cpu_light();
6317  
6318         list_for_each_entry_safe(fcf, next, &del_list, list) {
6319                 /* Removes fcf from current list */
6320 diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
6321 index e72673b0a8fb..da598a6caa22 100644
6322 --- a/drivers/scsi/libfc/fc_exch.c
6323 +++ b/drivers/scsi/libfc/fc_exch.c
6324 @@ -814,10 +814,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport,
6325         }
6326         memset(ep, 0, sizeof(*ep));
6327  
6328 -       cpu = get_cpu();
6329 +       cpu = get_cpu_light();
6330         pool = per_cpu_ptr(mp->pool, cpu);
6331         spin_lock_bh(&pool->lock);
6332 -       put_cpu();
6333 +       put_cpu_light();
6334  
6335         /* peek cache of free slot */
6336         if (pool->left != FC_XID_UNKNOWN) {
6337 diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
6338 index 763f012fdeca..d0f61b595470 100644
6339 --- a/drivers/scsi/libsas/sas_ata.c
6340 +++ b/drivers/scsi/libsas/sas_ata.c
6341 @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
6342         /* TODO: audit callers to ensure they are ready for qc_issue to
6343          * unconditionally re-enable interrupts
6344          */
6345 -       local_irq_save(flags);
6346 +       local_irq_save_nort(flags);
6347         spin_unlock(ap->lock);
6348  
6349         /* If the device fell off, no sense in issuing commands */
6350 @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
6351  
6352   out:
6353         spin_lock(ap->lock);
6354 -       local_irq_restore(flags);
6355 +       local_irq_restore_nort(flags);
6356         return ret;
6357  }
6358  
6359 diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h
6360 index edc48f3b8230..ee5c6f9dfb6f 100644
6361 --- a/drivers/scsi/qla2xxx/qla_inline.h
6362 +++ b/drivers/scsi/qla2xxx/qla_inline.h
6363 @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp)
6364  {
6365         unsigned long flags;
6366         struct qla_hw_data *ha = rsp->hw;
6367 -       local_irq_save(flags);
6368 +       local_irq_save_nort(flags);
6369         if (IS_P3P_TYPE(ha))
6370                 qla82xx_poll(0, rsp);
6371         else
6372                 ha->isp_ops->intr_handler(0, rsp);
6373 -       local_irq_restore(flags);
6374 +       local_irq_restore_nort(flags);
6375  }
6376  
6377  static inline uint8_t *
6378 diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
6379 index 987f1c729e9c..18391e07d70f 100644
6380 --- a/drivers/scsi/qla2xxx/qla_isr.c
6381 +++ b/drivers/scsi/qla2xxx/qla_isr.c
6382 @@ -3125,7 +3125,11 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
6383                 * kref_put().
6384                 */
6385                 kref_get(&qentry->irq_notify.kref);
6386 +#ifdef CONFIG_PREEMPT_RT_BASE
6387 +               swork_queue(&qentry->irq_notify.swork);
6388 +#else
6389                 schedule_work(&qentry->irq_notify.work);
6390 +#endif
6391         }
6392  
6393         /*
6394 diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
6395 index 97f0a2bd93ed..a4f45aaa9ad4 100644
6396 --- a/drivers/thermal/x86_pkg_temp_thermal.c
6397 +++ b/drivers/thermal/x86_pkg_temp_thermal.c
6398 @@ -29,6 +29,7 @@
6399  #include <linux/pm.h>
6400  #include <linux/thermal.h>
6401  #include <linux/debugfs.h>
6402 +#include <linux/swork.h>
6403  #include <asm/cpu_device_id.h>
6404  #include <asm/mce.h>
6405  
6406 @@ -352,7 +353,7 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
6407         }
6408  }
6409  
6410 -static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
6411 +static void platform_thermal_notify_work(struct swork_event *event)
6412  {
6413         unsigned long flags;
6414         int cpu = smp_processor_id();
6415 @@ -369,7 +370,7 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
6416                         pkg_work_scheduled[phy_id]) {
6417                 disable_pkg_thres_interrupt();
6418                 spin_unlock_irqrestore(&pkg_work_lock, flags);
6419 -               return -EINVAL;
6420 +               return;
6421         }
6422         pkg_work_scheduled[phy_id] = 1;
6423         spin_unlock_irqrestore(&pkg_work_lock, flags);
6424 @@ -378,9 +379,48 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
6425         schedule_delayed_work_on(cpu,
6426                                 &per_cpu(pkg_temp_thermal_threshold_work, cpu),
6427                                 msecs_to_jiffies(notify_delay_ms));
6428 +}
6429 +
6430 +#ifdef CONFIG_PREEMPT_RT_FULL
6431 +static struct swork_event notify_work;
6432 +
6433 +static int thermal_notify_work_init(void)
6434 +{
6435 +       int err;
6436 +
6437 +       err = swork_get();
6438 +       if (err)
6439 +               return err;
6440 +
6441 +       INIT_SWORK(&notify_work, platform_thermal_notify_work);
6442         return 0;
6443  }
6444  
6445 +static void thermal_notify_work_cleanup(void)
6446 +{
6447 +       swork_put();
6448 +}
6449 +
6450 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
6451 +{
6452 +       swork_queue(&notify_work);
6453 +       return 0;
6454 +}
6455 +
6456 +#else  /* !CONFIG_PREEMPT_RT_FULL */
6457 +
6458 +static int thermal_notify_work_init(void) { return 0; }
6459 +
6460 +static void thermal_notify_work_cleanup(void) {  }
6461 +
6462 +static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val)
6463 +{
6464 +       platform_thermal_notify_work(NULL);
6465 +
6466 +       return 0;
6467 +}
6468 +#endif /* CONFIG_PREEMPT_RT_FULL */
6469 +
6470  static int find_siblings_cpu(int cpu)
6471  {
6472         int i;
6473 @@ -584,6 +624,9 @@ static int __init pkg_temp_thermal_init(void)
6474         if (!x86_match_cpu(pkg_temp_thermal_ids))
6475                 return -ENODEV;
6476  
6477 +       if (thermal_notify_work_init())
6478 +               return -ENODEV;
6479 +
6480         spin_lock_init(&pkg_work_lock);
6481         platform_thermal_package_notify =
6482                         pkg_temp_thermal_platform_thermal_notify;
6483 @@ -608,7 +651,7 @@ static int __init pkg_temp_thermal_init(void)
6484         kfree(pkg_work_scheduled);
6485         platform_thermal_package_notify = NULL;
6486         platform_thermal_package_rate_control = NULL;
6487 -
6488 +       thermal_notify_work_cleanup();
6489         return -ENODEV;
6490  }
6491  
6492 @@ -633,6 +676,7 @@ static void __exit pkg_temp_thermal_exit(void)
6493         mutex_unlock(&phy_dev_list_mutex);
6494         platform_thermal_package_notify = NULL;
6495         platform_thermal_package_rate_control = NULL;
6496 +       thermal_notify_work_cleanup();
6497         for_each_online_cpu(i)
6498                 cancel_delayed_work_sync(
6499                         &per_cpu(pkg_temp_thermal_threshold_work, i));
6500 diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
6501 index dcf43f66404f..a9ae57122841 100644
6502 --- a/drivers/tty/serial/8250/8250_core.c
6503 +++ b/drivers/tty/serial/8250/8250_core.c
6504 @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg;
6505  
6506  static unsigned int skip_txen_test; /* force skip of txen test at init time */
6507  
6508 -#define PASS_LIMIT     512
6509 +/*
6510 + * On -rt we can have more delays, and legitimately
6511 + * so - so don't drop work spuriously and spam the
6512 + * syslog:
6513 + */
6514 +#ifdef CONFIG_PREEMPT_RT_FULL
6515 +# define PASS_LIMIT    1000000
6516 +#else
6517 +# define PASS_LIMIT    512
6518 +#endif
6519  
6520  #include <asm/serial.h>
6521  /*
6522 diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
6523 index 858a54633664..fc44fb59aef6 100644
6524 --- a/drivers/tty/serial/8250/8250_port.c
6525 +++ b/drivers/tty/serial/8250/8250_port.c
6526 @@ -35,6 +35,7 @@
6527  #include <linux/nmi.h>
6528  #include <linux/mutex.h>
6529  #include <linux/slab.h>
6530 +#include <linux/kdb.h>
6531  #include <linux/uaccess.h>
6532  #include <linux/pm_runtime.h>
6533  #include <linux/timer.h>
6534 @@ -3109,9 +3110,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
6535  
6536         serial8250_rpm_get(up);
6537  
6538 -       if (port->sysrq)
6539 +       if (port->sysrq || oops_in_progress)
6540                 locked = 0;
6541 -       else if (oops_in_progress)
6542 +       else if (in_kdb_printk())
6543                 locked = spin_trylock_irqsave(&port->lock, flags);
6544         else
6545                 spin_lock_irqsave(&port->lock, flags);
6546 diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
6547 index 8a9e213387a7..dd1f9a426b74 100644
6548 --- a/drivers/tty/serial/amba-pl011.c
6549 +++ b/drivers/tty/serial/amba-pl011.c
6550 @@ -2167,13 +2167,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
6551  
6552         clk_enable(uap->clk);
6553  
6554 -       local_irq_save(flags);
6555 +       /*
6556 +        * local_irq_save(flags);
6557 +        *
6558 +        * This local_irq_save() is nonsense. If we come in via sysrq
6559 +        * handling, then interrupts are already disabled. Aside from
6560 +        * that the port.sysrq check is racy on SMP regardless.
6561 +       */
6562         if (uap->port.sysrq)
6563                 locked = 0;
6564         else if (oops_in_progress)
6565 -               locked = spin_trylock(&uap->port.lock);
6566 +               locked = spin_trylock_irqsave(&uap->port.lock, flags);
6567         else
6568 -               spin_lock(&uap->port.lock);
6569 +               spin_lock_irqsave(&uap->port.lock, flags);
6570  
6571         /*
6572          *      First save the CR then disable the interrupts
6573 @@ -2197,8 +2203,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count)
6574                 pl011_write(old_cr, uap, REG_CR);
6575  
6576         if (locked)
6577 -               spin_unlock(&uap->port.lock);
6578 -       local_irq_restore(flags);
6579 +               spin_unlock_irqrestore(&uap->port.lock, flags);
6580  
6581         clk_disable(uap->clk);
6582  }
6583 diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c
6584 index a2a529994ba5..0ee7c4c518df 100644
6585 --- a/drivers/tty/serial/omap-serial.c
6586 +++ b/drivers/tty/serial/omap-serial.c
6587 @@ -1257,13 +1257,10 @@ serial_omap_console_write(struct console *co, const char *s,
6588  
6589         pm_runtime_get_sync(up->dev);
6590  
6591 -       local_irq_save(flags);
6592 -       if (up->port.sysrq)
6593 -               locked = 0;
6594 -       else if (oops_in_progress)
6595 -               locked = spin_trylock(&up->port.lock);
6596 +       if (up->port.sysrq || oops_in_progress)
6597 +               locked = spin_trylock_irqsave(&up->port.lock, flags);
6598         else
6599 -               spin_lock(&up->port.lock);
6600 +               spin_lock_irqsave(&up->port.lock, flags);
6601  
6602         /*
6603          * First save the IER then disable the interrupts
6604 @@ -1292,8 +1289,7 @@ serial_omap_console_write(struct console *co, const char *s,
6605         pm_runtime_mark_last_busy(up->dev);
6606         pm_runtime_put_autosuspend(up->dev);
6607         if (locked)
6608 -               spin_unlock(&up->port.lock);
6609 -       local_irq_restore(flags);
6610 +               spin_unlock_irqrestore(&up->port.lock, flags);
6611  }
6612  
6613  static int __init
6614 diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
6615 index f36e6df2fa90..e086ea4d2997 100644
6616 --- a/drivers/tty/serial/sc16is7xx.c
6617 +++ b/drivers/tty/serial/sc16is7xx.c
6618 @@ -1240,7 +1240,7 @@ static int sc16is7xx_probe(struct device *dev,
6619  
6620         /* Setup interrupt */
6621         ret = devm_request_irq(dev, irq, sc16is7xx_irq,
6622 -                              IRQF_ONESHOT | flags, dev_name(dev), s);
6623 +                              flags, dev_name(dev), s);
6624         if (!ret)
6625                 return 0;
6626  
6627 diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
6628 index d2e3f655c26f..fdd027a9bbd7 100644
6629 --- a/drivers/usb/core/hcd.c
6630 +++ b/drivers/usb/core/hcd.c
6631 @@ -1760,9 +1760,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
6632          * and no one may trigger the above deadlock situation when
6633          * running complete() in tasklet.
6634          */
6635 -       local_irq_save(flags);
6636 +       local_irq_save_nort(flags);
6637         urb->complete(urb);
6638 -       local_irq_restore(flags);
6639 +       local_irq_restore_nort(flags);
6640  
6641         usb_anchor_resume_wakeups(anchor);
6642         atomic_dec(&urb->use_count);
6643 diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
6644 index 5c8429f23a89..fa835fb1a186 100644
6645 --- a/drivers/usb/gadget/function/f_fs.c
6646 +++ b/drivers/usb/gadget/function/f_fs.c
6647 @@ -1509,7 +1509,7 @@ static void ffs_data_put(struct ffs_data *ffs)
6648                 pr_info("%s(): freeing\n", __func__);
6649                 ffs_data_clear(ffs);
6650                 BUG_ON(waitqueue_active(&ffs->ev.waitq) ||
6651 -                      waitqueue_active(&ffs->ep0req_completion.wait));
6652 +                      swait_active(&ffs->ep0req_completion.wait));
6653                 kfree(ffs->dev_name);
6654                 kfree(ffs);
6655         }
6656 diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
6657 index 16104b5ebdcb..5c506c2b88ad 100644
6658 --- a/drivers/usb/gadget/legacy/inode.c
6659 +++ b/drivers/usb/gadget/legacy/inode.c
6660 @@ -346,7 +346,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
6661         spin_unlock_irq (&epdata->dev->lock);
6662  
6663         if (likely (value == 0)) {
6664 -               value = wait_event_interruptible (done.wait, done.done);
6665 +               value = swait_event_interruptible (done.wait, done.done);
6666                 if (value != 0) {
6667                         spin_lock_irq (&epdata->dev->lock);
6668                         if (likely (epdata->ep != NULL)) {
6669 @@ -355,7 +355,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len)
6670                                 usb_ep_dequeue (epdata->ep, epdata->req);
6671                                 spin_unlock_irq (&epdata->dev->lock);
6672  
6673 -                               wait_event (done.wait, done.done);
6674 +                               swait_event (done.wait, done.done);
6675                                 if (epdata->status == -ECONNRESET)
6676                                         epdata->status = -EINTR;
6677                         } else {
6678 diff --git a/fs/aio.c b/fs/aio.c
6679 index 4fe81d1c60f9..e68c06a4a017 100644
6680 --- a/fs/aio.c
6681 +++ b/fs/aio.c
6682 @@ -40,6 +40,7 @@
6683  #include <linux/ramfs.h>
6684  #include <linux/percpu-refcount.h>
6685  #include <linux/mount.h>
6686 +#include <linux/swork.h>
6687  
6688  #include <asm/kmap_types.h>
6689  #include <asm/uaccess.h>
6690 @@ -115,7 +116,7 @@ struct kioctx {
6691         struct page             **ring_pages;
6692         long                    nr_pages;
6693  
6694 -       struct work_struct      free_work;
6695 +       struct swork_event      free_work;
6696  
6697         /*
6698          * signals when all in-flight requests are done
6699 @@ -258,6 +259,7 @@ static int __init aio_setup(void)
6700                 .mount          = aio_mount,
6701                 .kill_sb        = kill_anon_super,
6702         };
6703 +       BUG_ON(swork_get());
6704         aio_mnt = kern_mount(&aio_fs);
6705         if (IS_ERR(aio_mnt))
6706                 panic("Failed to create aio fs mount.");
6707 @@ -578,9 +580,9 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
6708         return cancel(&kiocb->common);
6709  }
6710  
6711 -static void free_ioctx(struct work_struct *work)
6712 +static void free_ioctx(struct swork_event *sev)
6713  {
6714 -       struct kioctx *ctx = container_of(work, struct kioctx, free_work);
6715 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
6716  
6717         pr_debug("freeing %p\n", ctx);
6718  
6719 @@ -599,8 +601,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
6720         if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
6721                 complete(&ctx->rq_wait->comp);
6722  
6723 -       INIT_WORK(&ctx->free_work, free_ioctx);
6724 -       schedule_work(&ctx->free_work);
6725 +       INIT_SWORK(&ctx->free_work, free_ioctx);
6726 +       swork_queue(&ctx->free_work);
6727  }
6728  
6729  /*
6730 @@ -608,9 +610,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
6731   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
6732   * now it's safe to cancel any that need to be.
6733   */
6734 -static void free_ioctx_users(struct percpu_ref *ref)
6735 +static void free_ioctx_users_work(struct swork_event *sev)
6736  {
6737 -       struct kioctx *ctx = container_of(ref, struct kioctx, users);
6738 +       struct kioctx *ctx = container_of(sev, struct kioctx, free_work);
6739         struct aio_kiocb *req;
6740  
6741         spin_lock_irq(&ctx->ctx_lock);
6742 @@ -629,6 +631,14 @@ static void free_ioctx_users(struct percpu_ref *ref)
6743         percpu_ref_put(&ctx->reqs);
6744  }
6745  
6746 +static void free_ioctx_users(struct percpu_ref *ref)
6747 +{
6748 +       struct kioctx *ctx = container_of(ref, struct kioctx, users);
6749 +
6750 +       INIT_SWORK(&ctx->free_work, free_ioctx_users_work);
6751 +       swork_queue(&ctx->free_work);
6752 +}
6753 +
6754  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
6755  {
6756         unsigned i, new_nr;
6757 diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
6758 index a439548de785..7c392647d03b 100644
6759 --- a/fs/autofs4/autofs_i.h
6760 +++ b/fs/autofs4/autofs_i.h
6761 @@ -30,6 +30,7 @@
6762  #include <linux/sched.h>
6763  #include <linux/mount.h>
6764  #include <linux/namei.h>
6765 +#include <linux/delay.h>
6766  #include <asm/current.h>
6767  #include <linux/uaccess.h>
6768  
6769 diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
6770 index d8e6d421c27f..2e689ab1306b 100644
6771 --- a/fs/autofs4/expire.c
6772 +++ b/fs/autofs4/expire.c
6773 @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev,
6774                         parent = p->d_parent;
6775                         if (!spin_trylock(&parent->d_lock)) {
6776                                 spin_unlock(&p->d_lock);
6777 -                               cpu_relax();
6778 +                               cpu_chill();
6779                                 goto relock;
6780                         }
6781                         spin_unlock(&p->d_lock);
6782 diff --git a/fs/buffer.c b/fs/buffer.c
6783 index 9c8eb9b6db6a..d15d77f72cf7 100644
6784 --- a/fs/buffer.c
6785 +++ b/fs/buffer.c
6786 @@ -301,8 +301,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
6787          * decide that the page is now completely done.
6788          */
6789         first = page_buffers(page);
6790 -       local_irq_save(flags);
6791 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6792 +       flags = bh_uptodate_lock_irqsave(first);
6793         clear_buffer_async_read(bh);
6794         unlock_buffer(bh);
6795         tmp = bh;
6796 @@ -315,8 +314,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
6797                 }
6798                 tmp = tmp->b_this_page;
6799         } while (tmp != bh);
6800 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6801 -       local_irq_restore(flags);
6802 +       bh_uptodate_unlock_irqrestore(first, flags);
6803  
6804         /*
6805          * If none of the buffers had errors and they are all
6806 @@ -328,9 +326,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
6807         return;
6808  
6809  still_busy:
6810 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6811 -       local_irq_restore(flags);
6812 -       return;
6813 +       bh_uptodate_unlock_irqrestore(first, flags);
6814  }
6815  
6816  /*
6817 @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
6818         }
6819  
6820         first = page_buffers(page);
6821 -       local_irq_save(flags);
6822 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
6823 +       flags = bh_uptodate_lock_irqsave(first);
6824  
6825         clear_buffer_async_write(bh);
6826         unlock_buffer(bh);
6827 @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
6828                 }
6829                 tmp = tmp->b_this_page;
6830         }
6831 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6832 -       local_irq_restore(flags);
6833 +       bh_uptodate_unlock_irqrestore(first, flags);
6834         end_page_writeback(page);
6835         return;
6836  
6837  still_busy:
6838 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
6839 -       local_irq_restore(flags);
6840 -       return;
6841 +       bh_uptodate_unlock_irqrestore(first, flags);
6842  }
6843  EXPORT_SYMBOL(end_buffer_async_write);
6844  
6845 @@ -3384,6 +3376,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
6846         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
6847         if (ret) {
6848                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
6849 +               buffer_head_init_locks(ret);
6850                 preempt_disable();
6851                 __this_cpu_inc(bh_accounting.nr);
6852                 recalc_bh_state();
6853 diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
6854 index 8f6a2a5863b9..4217828d0b68 100644
6855 --- a/fs/cifs/readdir.c
6856 +++ b/fs/cifs/readdir.c
6857 @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
6858         struct inode *inode;
6859         struct super_block *sb = parent->d_sb;
6860         struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
6861 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
6862 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
6863  
6864         cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
6865  
6866 diff --git a/fs/dcache.c b/fs/dcache.c
6867 index 5c7cc953ac81..a9bb31f1c1af 100644
6868 --- a/fs/dcache.c
6869 +++ b/fs/dcache.c
6870 @@ -19,6 +19,7 @@
6871  #include <linux/mm.h>
6872  #include <linux/fs.h>
6873  #include <linux/fsnotify.h>
6874 +#include <linux/delay.h>
6875  #include <linux/slab.h>
6876  #include <linux/init.h>
6877  #include <linux/hash.h>
6878 @@ -750,6 +751,8 @@ static inline bool fast_dput(struct dentry *dentry)
6879   */
6880  void dput(struct dentry *dentry)
6881  {
6882 +       struct dentry *parent;
6883 +
6884         if (unlikely(!dentry))
6885                 return;
6886  
6887 @@ -788,9 +791,18 @@ void dput(struct dentry *dentry)
6888         return;
6889  
6890  kill_it:
6891 -       dentry = dentry_kill(dentry);
6892 -       if (dentry) {
6893 -               cond_resched();
6894 +       parent = dentry_kill(dentry);
6895 +       if (parent) {
6896 +               int r;
6897 +
6898 +               if (parent == dentry) {
6899 +                       /* the task with the highest priority won't schedule */
6900 +                       r = cond_resched();
6901 +                       if (!r)
6902 +                               cpu_chill();
6903 +               } else {
6904 +                       dentry = parent;
6905 +               }
6906                 goto repeat;
6907         }
6908  }
6909 @@ -2321,7 +2333,7 @@ void d_delete(struct dentry * dentry)
6910         if (dentry->d_lockref.count == 1) {
6911                 if (!spin_trylock(&inode->i_lock)) {
6912                         spin_unlock(&dentry->d_lock);
6913 -                       cpu_relax();
6914 +                       cpu_chill();
6915                         goto again;
6916                 }
6917                 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
6918 @@ -2381,21 +2393,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n)
6919  
6920  static void d_wait_lookup(struct dentry *dentry)
6921  {
6922 -       if (d_in_lookup(dentry)) {
6923 -               DECLARE_WAITQUEUE(wait, current);
6924 -               add_wait_queue(dentry->d_wait, &wait);
6925 -               do {
6926 -                       set_current_state(TASK_UNINTERRUPTIBLE);
6927 -                       spin_unlock(&dentry->d_lock);
6928 -                       schedule();
6929 -                       spin_lock(&dentry->d_lock);
6930 -               } while (d_in_lookup(dentry));
6931 -       }
6932 +       struct swait_queue __wait;
6933 +
6934 +       if (!d_in_lookup(dentry))
6935 +               return;
6936 +
6937 +       INIT_LIST_HEAD(&__wait.task_list);
6938 +       do {
6939 +               prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE);
6940 +               spin_unlock(&dentry->d_lock);
6941 +               schedule();
6942 +               spin_lock(&dentry->d_lock);
6943 +       } while (d_in_lookup(dentry));
6944 +       finish_swait(dentry->d_wait, &__wait);
6945  }
6946  
6947  struct dentry *d_alloc_parallel(struct dentry *parent,
6948                                 const struct qstr *name,
6949 -                               wait_queue_head_t *wq)
6950 +                               struct swait_queue_head *wq)
6951  {
6952         unsigned int hash = name->hash;
6953         struct hlist_bl_head *b = in_lookup_hash(parent, hash);
6954 @@ -2504,7 +2519,7 @@ void __d_lookup_done(struct dentry *dentry)
6955         hlist_bl_lock(b);
6956         dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
6957         __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
6958 -       wake_up_all(dentry->d_wait);
6959 +       swake_up_all(dentry->d_wait);
6960         dentry->d_wait = NULL;
6961         hlist_bl_unlock(b);
6962         INIT_HLIST_NODE(&dentry->d_u.d_alias);
6963 @@ -3601,6 +3616,11 @@ EXPORT_SYMBOL(d_genocide);
6964  
6965  void __init vfs_caches_init_early(void)
6966  {
6967 +       int i;
6968 +
6969 +       for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
6970 +               INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
6971 +
6972         dcache_init_early();
6973         inode_init_early();
6974  }
6975 diff --git a/fs/eventpoll.c b/fs/eventpoll.c
6976 index 10db91218933..42af0a06f657 100644
6977 --- a/fs/eventpoll.c
6978 +++ b/fs/eventpoll.c
6979 @@ -510,12 +510,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
6980   */
6981  static void ep_poll_safewake(wait_queue_head_t *wq)
6982  {
6983 -       int this_cpu = get_cpu();
6984 +       int this_cpu = get_cpu_light();
6985  
6986         ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
6987                        ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
6988  
6989 -       put_cpu();
6990 +       put_cpu_light();
6991  }
6992  
6993  static void ep_remove_wait_queue(struct eppoll_entry *pwq)
6994 diff --git a/fs/exec.c b/fs/exec.c
6995 index 6fcfb3f7b137..751370a71ec5 100644
6996 --- a/fs/exec.c
6997 +++ b/fs/exec.c
6998 @@ -1012,12 +1012,14 @@ static int exec_mmap(struct mm_struct *mm)
6999                 }
7000         }
7001         task_lock(tsk);
7002 +       preempt_disable_rt();
7003         active_mm = tsk->active_mm;
7004         tsk->mm = mm;
7005         tsk->active_mm = mm;
7006         activate_mm(active_mm, mm);
7007         tsk->mm->vmacache_seqnum = 0;
7008         vmacache_flush(tsk);
7009 +       preempt_enable_rt();
7010         task_unlock(tsk);
7011         if (old_mm) {
7012                 up_read(&old_mm->mmap_sem);
7013 diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
7014 index 4ff9251e9d3a..8fe489ec2ef1 100644
7015 --- a/fs/fuse/dir.c
7016 +++ b/fs/fuse/dir.c
7017 @@ -1174,7 +1174,7 @@ static int fuse_direntplus_link(struct file *file,
7018         struct inode *dir = d_inode(parent);
7019         struct fuse_conn *fc;
7020         struct inode *inode;
7021 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7022 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7023  
7024         if (!o->nodeid) {
7025                 /*
7026 diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
7027 index 684996c8a3a4..6e18a06aaabe 100644
7028 --- a/fs/jbd2/checkpoint.c
7029 +++ b/fs/jbd2/checkpoint.c
7030 @@ -116,6 +116,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
7031         nblocks = jbd2_space_needed(journal);
7032         while (jbd2_log_space_left(journal) < nblocks) {
7033                 write_unlock(&journal->j_state_lock);
7034 +               if (current->plug)
7035 +                       io_schedule();
7036                 mutex_lock(&journal->j_checkpoint_mutex);
7037  
7038                 /*
7039 diff --git a/fs/namei.c b/fs/namei.c
7040 index adb04146df09..a89dfaf9f209 100644
7041 --- a/fs/namei.c
7042 +++ b/fs/namei.c
7043 @@ -1629,7 +1629,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
7044  {
7045         struct dentry *dentry = ERR_PTR(-ENOENT), *old;
7046         struct inode *inode = dir->d_inode;
7047 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7048 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7049  
7050         inode_lock_shared(inode);
7051         /* Don't go there if it's already dead */
7052 @@ -3086,7 +3086,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
7053         struct dentry *dentry;
7054         int error, create_error = 0;
7055         umode_t mode = op->mode;
7056 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7057 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7058  
7059         if (unlikely(IS_DEADDIR(dir_inode)))
7060                 return -ENOENT;
7061 diff --git a/fs/namespace.c b/fs/namespace.c
7062 index 7bb2cda3bfef..cf79b18e7b58 100644
7063 --- a/fs/namespace.c
7064 +++ b/fs/namespace.c
7065 @@ -14,6 +14,7 @@
7066  #include <linux/mnt_namespace.h>
7067  #include <linux/user_namespace.h>
7068  #include <linux/namei.h>
7069 +#include <linux/delay.h>
7070  #include <linux/security.h>
7071  #include <linux/idr.h>
7072  #include <linux/init.h>                /* init_rootfs */
7073 @@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m)
7074          * incremented count after it has set MNT_WRITE_HOLD.
7075          */
7076         smp_mb();
7077 -       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
7078 -               cpu_relax();
7079 +       while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
7080 +               preempt_enable();
7081 +               cpu_chill();
7082 +               preempt_disable();
7083 +       }
7084         /*
7085          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
7086          * be set to match its requirements. So we must not load that until
7087 diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
7088 index b9c65421ed81..03ffe8af8785 100644
7089 --- a/fs/nfs/delegation.c
7090 +++ b/fs/nfs/delegation.c
7091 @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode,
7092                 sp = state->owner;
7093                 /* Block nfs4_proc_unlck */
7094                 mutex_lock(&sp->so_delegreturn_mutex);
7095 -               seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
7096 +               seq = read_seqbegin(&sp->so_reclaim_seqlock);
7097                 err = nfs4_open_delegation_recall(ctx, state, stateid, type);
7098                 if (!err)
7099                         err = nfs_delegation_claim_locks(ctx, state, stateid);
7100 -               if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
7101 +               if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
7102                         err = -EAGAIN;
7103                 mutex_unlock(&sp->so_delegreturn_mutex);
7104                 put_nfs_open_context(ctx);
7105 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
7106 index 6bc5a68e39f1..ce6488e07a13 100644
7107 --- a/fs/nfs/dir.c
7108 +++ b/fs/nfs/dir.c
7109 @@ -485,7 +485,7 @@ static
7110  void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
7111  {
7112         struct qstr filename = QSTR_INIT(entry->name, entry->len);
7113 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7114 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7115         struct dentry *dentry;
7116         struct dentry *alias;
7117         struct inode *dir = d_inode(parent);
7118 @@ -1490,7 +1490,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
7119                     struct file *file, unsigned open_flags,
7120                     umode_t mode, int *opened)
7121  {
7122 -       DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7123 +       DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7124         struct nfs_open_context *ctx;
7125         struct dentry *res;
7126         struct iattr attr = { .ia_valid = ATTR_OPEN };
7127 @@ -1805,7 +1805,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
7128  
7129         trace_nfs_rmdir_enter(dir, dentry);
7130         if (d_really_is_positive(dentry)) {
7131 +#ifdef CONFIG_PREEMPT_RT_BASE
7132 +               down(&NFS_I(d_inode(dentry))->rmdir_sem);
7133 +#else
7134                 down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
7135 +#endif
7136                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
7137                 /* Ensure the VFS deletes this inode */
7138                 switch (error) {
7139 @@ -1815,7 +1819,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
7140                 case -ENOENT:
7141                         nfs_dentry_handle_enoent(dentry);
7142                 }
7143 +#ifdef CONFIG_PREEMPT_RT_BASE
7144 +               up(&NFS_I(d_inode(dentry))->rmdir_sem);
7145 +#else
7146                 up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
7147 +#endif
7148         } else
7149                 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
7150         trace_nfs_rmdir_exit(dir, dentry, error);
7151 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
7152 index bf4ec5ecc97e..36cd5fc9192c 100644
7153 --- a/fs/nfs/inode.c
7154 +++ b/fs/nfs/inode.c
7155 @@ -1957,7 +1957,11 @@ static void init_once(void *foo)
7156         nfsi->nrequests = 0;
7157         nfsi->commit_info.ncommit = 0;
7158         atomic_set(&nfsi->commit_info.rpcs_out, 0);
7159 +#ifdef CONFIG_PREEMPT_RT_BASE
7160 +       sema_init(&nfsi->rmdir_sem, 1);
7161 +#else
7162         init_rwsem(&nfsi->rmdir_sem);
7163 +#endif
7164         nfs4_init_once(nfsi);
7165  }
7166  
7167 diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
7168 index 9bf64eacba5b..041da5cb80f5 100644
7169 --- a/fs/nfs/nfs4_fs.h
7170 +++ b/fs/nfs/nfs4_fs.h
7171 @@ -107,7 +107,7 @@ struct nfs4_state_owner {
7172         unsigned long        so_flags;
7173         struct list_head     so_states;
7174         struct nfs_seqid_counter so_seqid;
7175 -       seqcount_t           so_reclaim_seqcount;
7176 +       seqlock_t            so_reclaim_seqlock;
7177         struct mutex         so_delegreturn_mutex;
7178  };
7179  
7180 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
7181 index a9dec32ba9ba..49b64dfb307c 100644
7182 --- a/fs/nfs/nfs4proc.c
7183 +++ b/fs/nfs/nfs4proc.c
7184 @@ -2525,7 +2525,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
7185         unsigned int seq;
7186         int ret;
7187  
7188 -       seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
7189 +       seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
7190  
7191         ret = _nfs4_proc_open(opendata);
7192         if (ret != 0)
7193 @@ -2561,7 +2561,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
7194         ctx->state = state;
7195         if (d_inode(dentry) == state->inode) {
7196                 nfs_inode_attach_open_context(ctx);
7197 -               if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
7198 +               if (read_seqretry(&sp->so_reclaim_seqlock, seq))
7199                         nfs4_schedule_stateid_recovery(server, state);
7200         }
7201  out:
7202 diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
7203 index 8353f33f0466..657e13ed4b5d 100644
7204 --- a/fs/nfs/nfs4state.c
7205 +++ b/fs/nfs/nfs4state.c
7206 @@ -488,7 +488,7 @@ nfs4_alloc_state_owner(struct nfs_server *server,
7207         nfs4_init_seqid_counter(&sp->so_seqid);
7208         atomic_set(&sp->so_count, 1);
7209         INIT_LIST_HEAD(&sp->so_lru);
7210 -       seqcount_init(&sp->so_reclaim_seqcount);
7211 +       seqlock_init(&sp->so_reclaim_seqlock);
7212         mutex_init(&sp->so_delegreturn_mutex);
7213         return sp;
7214  }
7215 @@ -1459,8 +1459,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
7216          * recovering after a network partition or a reboot from a
7217          * server that doesn't support a grace period.
7218          */
7219 +#ifdef CONFIG_PREEMPT_RT_FULL
7220 +       write_seqlock(&sp->so_reclaim_seqlock);
7221 +#else
7222 +       write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
7223 +#endif
7224         spin_lock(&sp->so_lock);
7225 -       raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
7226  restart:
7227         list_for_each_entry(state, &sp->so_states, open_states) {
7228                 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
7229 @@ -1528,14 +1532,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
7230                 spin_lock(&sp->so_lock);
7231                 goto restart;
7232         }
7233 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
7234         spin_unlock(&sp->so_lock);
7235 +#ifdef CONFIG_PREEMPT_RT_FULL
7236 +       write_sequnlock(&sp->so_reclaim_seqlock);
7237 +#else
7238 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
7239 +#endif
7240         return 0;
7241  out_err:
7242         nfs4_put_open_state(state);
7243 -       spin_lock(&sp->so_lock);
7244 -       raw_write_seqcount_end(&sp->so_reclaim_seqcount);
7245 -       spin_unlock(&sp->so_lock);
7246 +#ifdef CONFIG_PREEMPT_RT_FULL
7247 +       write_sequnlock(&sp->so_reclaim_seqlock);
7248 +#else
7249 +       write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
7250 +#endif
7251         return status;
7252  }
7253  
7254 diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
7255 index 191aa577dd1f..58990c8f52e0 100644
7256 --- a/fs/nfs/unlink.c
7257 +++ b/fs/nfs/unlink.c
7258 @@ -12,7 +12,7 @@
7259  #include <linux/sunrpc/clnt.h>
7260  #include <linux/nfs_fs.h>
7261  #include <linux/sched.h>
7262 -#include <linux/wait.h>
7263 +#include <linux/swait.h>
7264  #include <linux/namei.h>
7265  #include <linux/fsnotify.h>
7266  
7267 @@ -51,6 +51,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
7268                 rpc_restart_call_prepare(task);
7269  }
7270  
7271 +#ifdef CONFIG_PREEMPT_RT_BASE
7272 +static void nfs_down_anon(struct semaphore *sema)
7273 +{
7274 +       down(sema);
7275 +}
7276 +
7277 +static void nfs_up_anon(struct semaphore *sema)
7278 +{
7279 +       up(sema);
7280 +}
7281 +
7282 +#else
7283 +static void nfs_down_anon(struct rw_semaphore *rwsem)
7284 +{
7285 +       down_read_non_owner(rwsem);
7286 +}
7287 +
7288 +static void nfs_up_anon(struct rw_semaphore *rwsem)
7289 +{
7290 +       up_read_non_owner(rwsem);
7291 +}
7292 +#endif
7293 +
7294  /**
7295   * nfs_async_unlink_release - Release the sillydelete data.
7296   * @task: rpc_task of the sillydelete
7297 @@ -64,7 +87,7 @@ static void nfs_async_unlink_release(void *calldata)
7298         struct dentry *dentry = data->dentry;
7299         struct super_block *sb = dentry->d_sb;
7300  
7301 -       up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
7302 +       nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
7303         d_lookup_done(dentry);
7304         nfs_free_unlinkdata(data);
7305         dput(dentry);
7306 @@ -117,10 +140,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
7307         struct inode *dir = d_inode(dentry->d_parent);
7308         struct dentry *alias;
7309  
7310 -       down_read_non_owner(&NFS_I(dir)->rmdir_sem);
7311 +       nfs_down_anon(&NFS_I(dir)->rmdir_sem);
7312         alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
7313         if (IS_ERR(alias)) {
7314 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
7315 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
7316                 return 0;
7317         }
7318         if (!d_in_lookup(alias)) {
7319 @@ -142,7 +165,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data)
7320                         ret = 0;
7321                 spin_unlock(&alias->d_lock);
7322                 dput(alias);
7323 -               up_read_non_owner(&NFS_I(dir)->rmdir_sem);
7324 +               nfs_up_anon(&NFS_I(dir)->rmdir_sem);
7325                 /*
7326                  * If we'd displaced old cached devname, free it.  At that
7327                  * point dentry is definitely not a root, so we won't need
7328 @@ -182,7 +205,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name)
7329                 goto out_free_name;
7330         }
7331         data->res.dir_attr = &data->dir_attr;
7332 -       init_waitqueue_head(&data->wq);
7333 +       init_swait_queue_head(&data->wq);
7334  
7335         status = -EBUSY;
7336         spin_lock(&dentry->d_lock);
7337 diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
7338 index fe251f187ff8..e89da4fb14c2 100644
7339 --- a/fs/ntfs/aops.c
7340 +++ b/fs/ntfs/aops.c
7341 @@ -92,13 +92,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
7342                         ofs = 0;
7343                         if (file_ofs < init_size)
7344                                 ofs = init_size - file_ofs;
7345 -                       local_irq_save(flags);
7346 +                       local_irq_save_nort(flags);
7347                         kaddr = kmap_atomic(page);
7348                         memset(kaddr + bh_offset(bh) + ofs, 0,
7349                                         bh->b_size - ofs);
7350                         flush_dcache_page(page);
7351                         kunmap_atomic(kaddr);
7352 -                       local_irq_restore(flags);
7353 +                       local_irq_restore_nort(flags);
7354                 }
7355         } else {
7356                 clear_buffer_uptodate(bh);
7357 @@ -107,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
7358                                 "0x%llx.", (unsigned long long)bh->b_blocknr);
7359         }
7360         first = page_buffers(page);
7361 -       local_irq_save(flags);
7362 -       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
7363 +       flags = bh_uptodate_lock_irqsave(first);
7364         clear_buffer_async_read(bh);
7365         unlock_buffer(bh);
7366         tmp = bh;
7367 @@ -123,8 +122,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
7368                 }
7369                 tmp = tmp->b_this_page;
7370         } while (tmp != bh);
7371 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
7372 -       local_irq_restore(flags);
7373 +       bh_uptodate_unlock_irqrestore(first, flags);
7374         /*
7375          * If none of the buffers had errors then we can set the page uptodate,
7376          * but we first have to perform the post read mst fixups, if the
7377 @@ -145,13 +143,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
7378                 recs = PAGE_SIZE / rec_size;
7379                 /* Should have been verified before we got here... */
7380                 BUG_ON(!recs);
7381 -               local_irq_save(flags);
7382 +               local_irq_save_nort(flags);
7383                 kaddr = kmap_atomic(page);
7384                 for (i = 0; i < recs; i++)
7385                         post_read_mst_fixup((NTFS_RECORD*)(kaddr +
7386                                         i * rec_size), rec_size);
7387                 kunmap_atomic(kaddr);
7388 -               local_irq_restore(flags);
7389 +               local_irq_restore_nort(flags);
7390                 flush_dcache_page(page);
7391                 if (likely(page_uptodate && !PageError(page)))
7392                         SetPageUptodate(page);
7393 @@ -159,9 +157,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
7394         unlock_page(page);
7395         return;
7396  still_busy:
7397 -       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
7398 -       local_irq_restore(flags);
7399 -       return;
7400 +       bh_uptodate_unlock_irqrestore(first, flags);
7401  }
7402  
7403  /**
7404 diff --git a/fs/proc/base.c b/fs/proc/base.c
7405 index ac0df4dde823..ad1a4723ffdd 100644
7406 --- a/fs/proc/base.c
7407 +++ b/fs/proc/base.c
7408 @@ -1819,7 +1819,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
7409  
7410         child = d_hash_and_lookup(dir, &qname);
7411         if (!child) {
7412 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7413 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7414                 child = d_alloc_parallel(dir, &qname, &wq);
7415                 if (IS_ERR(child))
7416                         goto end_instantiate;
7417 diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
7418 index 1b93650dda2f..c553bf3ea541 100644
7419 --- a/fs/proc/proc_sysctl.c
7420 +++ b/fs/proc/proc_sysctl.c
7421 @@ -627,7 +627,7 @@ static bool proc_sys_fill_cache(struct file *file,
7422  
7423         child = d_lookup(dir, &qname);
7424         if (!child) {
7425 -               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
7426 +               DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq);
7427                 child = d_alloc_parallel(dir, &qname, &wq);
7428                 if (IS_ERR(child))
7429                         return false;
7430 diff --git a/fs/timerfd.c b/fs/timerfd.c
7431 index 9ae4abb4110b..8644b67c48fd 100644
7432 --- a/fs/timerfd.c
7433 +++ b/fs/timerfd.c
7434 @@ -460,7 +460,10 @@ static int do_timerfd_settime(int ufd, int flags,
7435                                 break;
7436                 }
7437                 spin_unlock_irq(&ctx->wqh.lock);
7438 -               cpu_relax();
7439 +               if (isalarm(ctx))
7440 +                       hrtimer_wait_for_timer(&ctx->t.alarm.timer);
7441 +               else
7442 +                       hrtimer_wait_for_timer(&ctx->t.tmr);
7443         }
7444  
7445         /*
7446 diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
7447 index 93b61b1f2beb..58270adb46ce 100644
7448 --- a/include/acpi/platform/aclinux.h
7449 +++ b/include/acpi/platform/aclinux.h
7450 @@ -131,6 +131,7 @@
7451  
7452  #define acpi_cache_t                        struct kmem_cache
7453  #define acpi_spinlock                       spinlock_t *
7454 +#define acpi_raw_spinlock              raw_spinlock_t *
7455  #define acpi_cpu_flags                      unsigned long
7456  
7457  /* Use native linux version of acpi_os_allocate_zeroed */
7458 @@ -149,6 +150,20 @@
7459  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id
7460  #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock
7461  
7462 +#define acpi_os_create_raw_lock(__handle)                      \
7463 +({                                                             \
7464 +        raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock));   \
7465 +                                                               \
7466 +        if (lock) {                                            \
7467 +               *(__handle) = lock;                             \
7468 +               raw_spin_lock_init(*(__handle));                \
7469 +        }                                                      \
7470 +        lock ? AE_OK : AE_NO_MEMORY;                           \
7471 + })
7472 +
7473 +#define acpi_os_delete_raw_lock(__handle)      kfree(__handle)
7474 +
7475 +
7476  /*
7477   * OSL interfaces used by debugger/disassembler
7478   */
7479 diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
7480 index 6f96247226a4..fa53a21263c2 100644
7481 --- a/include/asm-generic/bug.h
7482 +++ b/include/asm-generic/bug.h
7483 @@ -215,6 +215,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
7484  # define WARN_ON_SMP(x)                        ({0;})
7485  #endif
7486  
7487 +#ifdef CONFIG_PREEMPT_RT_BASE
7488 +# define BUG_ON_RT(c)                  BUG_ON(c)
7489 +# define BUG_ON_NONRT(c)               do { } while (0)
7490 +# define WARN_ON_RT(condition)         WARN_ON(condition)
7491 +# define WARN_ON_NONRT(condition)      do { } while (0)
7492 +# define WARN_ON_ONCE_NONRT(condition) do { } while (0)
7493 +#else
7494 +# define BUG_ON_RT(c)                  do { } while (0)
7495 +# define BUG_ON_NONRT(c)               BUG_ON(c)
7496 +# define WARN_ON_RT(condition)         do { } while (0)
7497 +# define WARN_ON_NONRT(condition)      WARN_ON(condition)
7498 +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition)
7499 +#endif
7500 +
7501  #endif /* __ASSEMBLY__ */
7502  
7503  #endif
7504 diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
7505 index e43bbffb5b7a..c23892264109 100644
7506 --- a/include/linux/blk-mq.h
7507 +++ b/include/linux/blk-mq.h
7508 @@ -222,6 +222,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
7509  
7510  struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
7511  struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
7512 +void __blk_mq_complete_request_remote_work(struct work_struct *work);
7513  
7514  int blk_mq_request_started(struct request *rq);
7515  void blk_mq_start_request(struct request *rq);
7516 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
7517 index e79055c8b577..8583c1af14ad 100644
7518 --- a/include/linux/blkdev.h
7519 +++ b/include/linux/blkdev.h
7520 @@ -89,6 +89,7 @@ struct request {
7521         struct list_head queuelist;
7522         union {
7523                 struct call_single_data csd;
7524 +               struct work_struct work;
7525                 u64 fifo_time;
7526         };
7527  
7528 @@ -467,7 +468,7 @@ struct request_queue {
7529         struct throtl_data *td;
7530  #endif
7531         struct rcu_head         rcu_head;
7532 -       wait_queue_head_t       mq_freeze_wq;
7533 +       struct swait_queue_head mq_freeze_wq;
7534         struct percpu_ref       q_usage_counter;
7535         struct list_head        all_q_node;
7536  
7537 diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h
7538 index 8fdcb783197d..d07dbeec7bc1 100644
7539 --- a/include/linux/bottom_half.h
7540 +++ b/include/linux/bottom_half.h
7541 @@ -3,6 +3,39 @@
7542  
7543  #include <linux/preempt.h>
7544  
7545 +#ifdef CONFIG_PREEMPT_RT_FULL
7546 +
7547 +extern void __local_bh_disable(void);
7548 +extern void _local_bh_enable(void);
7549 +extern void __local_bh_enable(void);
7550 +
7551 +static inline void local_bh_disable(void)
7552 +{
7553 +       __local_bh_disable();
7554 +}
7555 +
7556 +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
7557 +{
7558 +       __local_bh_disable();
7559 +}
7560 +
7561 +static inline void local_bh_enable(void)
7562 +{
7563 +       __local_bh_enable();
7564 +}
7565 +
7566 +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
7567 +{
7568 +       __local_bh_enable();
7569 +}
7570 +
7571 +static inline void local_bh_enable_ip(unsigned long ip)
7572 +{
7573 +       __local_bh_enable();
7574 +}
7575 +
7576 +#else
7577 +
7578  #ifdef CONFIG_TRACE_IRQFLAGS
7579  extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
7580  #else
7581 @@ -30,5 +63,6 @@ static inline void local_bh_enable(void)
7582  {
7583         __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
7584  }
7585 +#endif
7586  
7587  #endif /* _LINUX_BH_H */
7588 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
7589 index ebbacd14d450..be5e87f6360a 100644
7590 --- a/include/linux/buffer_head.h
7591 +++ b/include/linux/buffer_head.h
7592 @@ -75,8 +75,50 @@ struct buffer_head {
7593         struct address_space *b_assoc_map;      /* mapping this buffer is
7594                                                    associated with */
7595         atomic_t b_count;               /* users using this buffer_head */
7596 +#ifdef CONFIG_PREEMPT_RT_BASE
7597 +       spinlock_t b_uptodate_lock;
7598 +#if IS_ENABLED(CONFIG_JBD2)
7599 +       spinlock_t b_state_lock;
7600 +       spinlock_t b_journal_head_lock;
7601 +#endif
7602 +#endif
7603  };
7604  
7605 +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
7606 +{
7607 +       unsigned long flags;
7608 +
7609 +#ifndef CONFIG_PREEMPT_RT_BASE
7610 +       local_irq_save(flags);
7611 +       bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
7612 +#else
7613 +       spin_lock_irqsave(&bh->b_uptodate_lock, flags);
7614 +#endif
7615 +       return flags;
7616 +}
7617 +
7618 +static inline void
7619 +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags)
7620 +{
7621 +#ifndef CONFIG_PREEMPT_RT_BASE
7622 +       bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
7623 +       local_irq_restore(flags);
7624 +#else
7625 +       spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
7626 +#endif
7627 +}
7628 +
7629 +static inline void buffer_head_init_locks(struct buffer_head *bh)
7630 +{
7631 +#ifdef CONFIG_PREEMPT_RT_BASE
7632 +       spin_lock_init(&bh->b_uptodate_lock);
7633 +#if IS_ENABLED(CONFIG_JBD2)
7634 +       spin_lock_init(&bh->b_state_lock);
7635 +       spin_lock_init(&bh->b_journal_head_lock);
7636 +#endif
7637 +#endif
7638 +}
7639 +
7640  /*
7641   * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
7642   * and buffer_foo() functions.
7643 diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
7644 index 5b17de62c962..56027cc01a56 100644
7645 --- a/include/linux/cgroup-defs.h
7646 +++ b/include/linux/cgroup-defs.h
7647 @@ -16,6 +16,7 @@
7648  #include <linux/percpu-refcount.h>
7649  #include <linux/percpu-rwsem.h>
7650  #include <linux/workqueue.h>
7651 +#include <linux/swork.h>
7652  
7653  #ifdef CONFIG_CGROUPS
7654  
7655 @@ -137,6 +138,7 @@ struct cgroup_subsys_state {
7656         /* percpu_ref killing and RCU release */
7657         struct rcu_head rcu_head;
7658         struct work_struct destroy_work;
7659 +       struct swork_event destroy_swork;
7660  };
7661  
7662  /*
7663 diff --git a/include/linux/completion.h b/include/linux/completion.h
7664 index 5d5aaae3af43..3bca1590e29f 100644
7665 --- a/include/linux/completion.h
7666 +++ b/include/linux/completion.h
7667 @@ -7,8 +7,7 @@
7668   * Atomic wait-for-completion handler data structures.
7669   * See kernel/sched/completion.c for details.
7670   */
7671 -
7672 -#include <linux/wait.h>
7673 +#include <linux/swait.h>
7674  
7675  /*
7676   * struct completion - structure used to maintain state for a "completion"
7677 @@ -24,11 +23,11 @@
7678   */
7679  struct completion {
7680         unsigned int done;
7681 -       wait_queue_head_t wait;
7682 +       struct swait_queue_head wait;
7683  };
7684  
7685  #define COMPLETION_INITIALIZER(work) \
7686 -       { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
7687 +       { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) }
7688  
7689  #define COMPLETION_INITIALIZER_ONSTACK(work) \
7690         ({ init_completion(&work); work; })
7691 @@ -73,7 +72,7 @@ struct completion {
7692  static inline void init_completion(struct completion *x)
7693  {
7694         x->done = 0;
7695 -       init_waitqueue_head(&x->wait);
7696 +       init_swait_queue_head(&x->wait);
7697  }
7698  
7699  /**
7700 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
7701 index 797d9c8e9a1b..6eabd9e8a98b 100644
7702 --- a/include/linux/cpu.h
7703 +++ b/include/linux/cpu.h
7704 @@ -201,6 +201,8 @@ extern void get_online_cpus(void);
7705  extern void put_online_cpus(void);
7706  extern void cpu_hotplug_disable(void);
7707  extern void cpu_hotplug_enable(void);
7708 +extern void pin_current_cpu(void);
7709 +extern void unpin_current_cpu(void);
7710  #define hotcpu_notifier(fn, pri)       cpu_notifier(fn, pri)
7711  #define __hotcpu_notifier(fn, pri)     __cpu_notifier(fn, pri)
7712  #define register_hotcpu_notifier(nb)   register_cpu_notifier(nb)
7713 @@ -218,6 +220,8 @@ static inline void cpu_hotplug_done(void) {}
7714  #define put_online_cpus()      do { } while (0)
7715  #define cpu_hotplug_disable()  do { } while (0)
7716  #define cpu_hotplug_enable()   do { } while (0)
7717 +static inline void pin_current_cpu(void) { }
7718 +static inline void unpin_current_cpu(void) { }
7719  #define hotcpu_notifier(fn, pri)       do { (void)(fn); } while (0)
7720  #define __hotcpu_notifier(fn, pri)     do { (void)(fn); } while (0)
7721  /* These aren't inline functions due to a GCC bug. */
7722 diff --git a/include/linux/dcache.h b/include/linux/dcache.h
7723 index 5ff3e9a4fe5f..ed0431599fd7 100644
7724 --- a/include/linux/dcache.h
7725 +++ b/include/linux/dcache.h
7726 @@ -11,6 +11,7 @@
7727  #include <linux/rcupdate.h>
7728  #include <linux/lockref.h>
7729  #include <linux/stringhash.h>
7730 +#include <linux/wait.h>
7731  
7732  struct path;
7733  struct vfsmount;
7734 @@ -100,7 +101,7 @@ struct dentry {
7735  
7736         union {
7737                 struct list_head d_lru;         /* LRU list */
7738 -               wait_queue_head_t *d_wait;      /* in-lookup ones only */
7739 +               struct swait_queue_head *d_wait;        /* in-lookup ones only */
7740         };
7741         struct list_head d_child;       /* child of parent list */
7742         struct list_head d_subdirs;     /* our children */
7743 @@ -230,7 +231,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op
7744  extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
7745  extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *);
7746  extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
7747 -                                       wait_queue_head_t *);
7748 +                                       struct swait_queue_head *);
7749  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
7750  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
7751  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
7752 diff --git a/include/linux/delay.h b/include/linux/delay.h
7753 index a6ecb34cf547..37caab306336 100644
7754 --- a/include/linux/delay.h
7755 +++ b/include/linux/delay.h
7756 @@ -52,4 +52,10 @@ static inline void ssleep(unsigned int seconds)
7757         msleep(seconds * 1000);
7758  }
7759  
7760 +#ifdef CONFIG_PREEMPT_RT_FULL
7761 +extern void cpu_chill(void);
7762 +#else
7763 +# define cpu_chill()   cpu_relax()
7764 +#endif
7765 +
7766  #endif /* defined(_LINUX_DELAY_H) */
7767 diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
7768 index 7d565afe35d2..8e31b4d245d2 100644
7769 --- a/include/linux/ftrace.h
7770 +++ b/include/linux/ftrace.h
7771 @@ -714,6 +714,7 @@ static inline void __ftrace_enabled_restore(int enabled)
7772  #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5))
7773  #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6))
7774  
7775 +#ifdef CONFIG_USING_GET_LOCK_PARENT_IP
7776  static inline unsigned long get_lock_parent_ip(void)
7777  {
7778         unsigned long addr = CALLER_ADDR0;
7779 @@ -725,6 +726,7 @@ static inline unsigned long get_lock_parent_ip(void)
7780                 return addr;
7781         return CALLER_ADDR2;
7782  }
7783 +#endif
7784  
7785  #ifdef CONFIG_IRQSOFF_TRACER
7786    extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
7787 diff --git a/include/linux/highmem.h b/include/linux/highmem.h
7788 index bb3f3297062a..a117a33ef72c 100644
7789 --- a/include/linux/highmem.h
7790 +++ b/include/linux/highmem.h
7791 @@ -7,6 +7,7 @@
7792  #include <linux/mm.h>
7793  #include <linux/uaccess.h>
7794  #include <linux/hardirq.h>
7795 +#include <linux/sched.h>
7796  
7797  #include <asm/cacheflush.h>
7798  
7799 @@ -65,7 +66,7 @@ static inline void kunmap(struct page *page)
7800  
7801  static inline void *kmap_atomic(struct page *page)
7802  {
7803 -       preempt_disable();
7804 +       preempt_disable_nort();
7805         pagefault_disable();
7806         return page_address(page);
7807  }
7808 @@ -74,7 +75,7 @@ static inline void *kmap_atomic(struct page *page)
7809  static inline void __kunmap_atomic(void *addr)
7810  {
7811         pagefault_enable();
7812 -       preempt_enable();
7813 +       preempt_enable_nort();
7814  }
7815  
7816  #define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
7817 @@ -86,32 +87,51 @@ static inline void __kunmap_atomic(void *addr)
7818  
7819  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
7820  
7821 +#ifndef CONFIG_PREEMPT_RT_FULL
7822  DECLARE_PER_CPU(int, __kmap_atomic_idx);
7823 +#endif
7824  
7825  static inline int kmap_atomic_idx_push(void)
7826  {
7827 +#ifndef CONFIG_PREEMPT_RT_FULL
7828         int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
7829  
7830 -#ifdef CONFIG_DEBUG_HIGHMEM
7831 +# ifdef CONFIG_DEBUG_HIGHMEM
7832         WARN_ON_ONCE(in_irq() && !irqs_disabled());
7833         BUG_ON(idx >= KM_TYPE_NR);
7834 -#endif
7835 +# endif
7836         return idx;
7837 +#else
7838 +       current->kmap_idx++;
7839 +       BUG_ON(current->kmap_idx > KM_TYPE_NR);
7840 +       return current->kmap_idx - 1;
7841 +#endif
7842  }
7843  
7844  static inline int kmap_atomic_idx(void)
7845  {
7846 +#ifndef CONFIG_PREEMPT_RT_FULL
7847         return __this_cpu_read(__kmap_atomic_idx) - 1;
7848 +#else
7849 +       return current->kmap_idx - 1;
7850 +#endif
7851  }
7852  
7853  static inline void kmap_atomic_idx_pop(void)
7854  {
7855 -#ifdef CONFIG_DEBUG_HIGHMEM
7856 +#ifndef CONFIG_PREEMPT_RT_FULL
7857 +# ifdef CONFIG_DEBUG_HIGHMEM
7858         int idx = __this_cpu_dec_return(__kmap_atomic_idx);
7859  
7860         BUG_ON(idx < 0);
7861 -#else
7862 +# else
7863         __this_cpu_dec(__kmap_atomic_idx);
7864 +# endif
7865 +#else
7866 +       current->kmap_idx--;
7867 +# ifdef CONFIG_DEBUG_HIGHMEM
7868 +       BUG_ON(current->kmap_idx < 0);
7869 +# endif
7870  #endif
7871  }
7872  
7873 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
7874 index 5e00f80b1535..65d0671f20b4 100644
7875 --- a/include/linux/hrtimer.h
7876 +++ b/include/linux/hrtimer.h
7877 @@ -87,6 +87,9 @@ enum hrtimer_restart {
7878   * @function:  timer expiry callback function
7879   * @base:      pointer to the timer base (per cpu and per clock)
7880   * @state:     state information (See bit values above)
7881 + * @cb_entry:  list entry to defer timers from hardirq context
7882 + * @irqsafe:   timer can run in hardirq context
7883 + * @praecox:   timer expiry time if expired at the time of programming
7884   * @is_rel:    Set if the timer was armed relative
7885   * @start_pid:  timer statistics field to store the pid of the task which
7886   *             started the timer
7887 @@ -103,6 +106,11 @@ struct hrtimer {
7888         enum hrtimer_restart            (*function)(struct hrtimer *);
7889         struct hrtimer_clock_base       *base;
7890         u8                              state;
7891 +       struct list_head                cb_entry;
7892 +       int                             irqsafe;
7893 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
7894 +       ktime_t                         praecox;
7895 +#endif
7896         u8                              is_rel;
7897  #ifdef CONFIG_TIMER_STATS
7898         int                             start_pid;
7899 @@ -123,11 +131,7 @@ struct hrtimer_sleeper {
7900         struct task_struct *task;
7901  };
7902  
7903 -#ifdef CONFIG_64BIT
7904  # define HRTIMER_CLOCK_BASE_ALIGN      64
7905 -#else
7906 -# define HRTIMER_CLOCK_BASE_ALIGN      32
7907 -#endif
7908  
7909  /**
7910   * struct hrtimer_clock_base - the timer base for a specific clock
7911 @@ -136,6 +140,7 @@ struct hrtimer_sleeper {
7912   *                     timer to a base on another cpu.
7913   * @clockid:           clock id for per_cpu support
7914   * @active:            red black tree root node for the active timers
7915 + * @expired:           list head for deferred timers.
7916   * @get_time:          function to retrieve the current time of the clock
7917   * @offset:            offset of this clock to the monotonic base
7918   */
7919 @@ -144,6 +149,7 @@ struct hrtimer_clock_base {
7920         int                     index;
7921         clockid_t               clockid;
7922         struct timerqueue_head  active;
7923 +       struct list_head        expired;
7924         ktime_t                 (*get_time)(void);
7925         ktime_t                 offset;
7926  } __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN)));
7927 @@ -187,6 +193,7 @@ struct hrtimer_cpu_base {
7928         raw_spinlock_t                  lock;
7929         seqcount_t                      seq;
7930         struct hrtimer                  *running;
7931 +       struct hrtimer                  *running_soft;
7932         unsigned int                    cpu;
7933         unsigned int                    active_bases;
7934         unsigned int                    clock_was_set_seq;
7935 @@ -203,6 +210,9 @@ struct hrtimer_cpu_base {
7936         unsigned int                    nr_hangs;
7937         unsigned int                    max_hang_time;
7938  #endif
7939 +#ifdef CONFIG_PREEMPT_RT_BASE
7940 +       wait_queue_head_t               wait;
7941 +#endif
7942         struct hrtimer_clock_base       clock_base[HRTIMER_MAX_CLOCK_BASES];
7943  } ____cacheline_aligned;
7944  
7945 @@ -412,6 +422,13 @@ static inline void hrtimer_restart(struct hrtimer *timer)
7946         hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
7947  }
7948  
7949 +/* Softirq preemption could deadlock timer removal */
7950 +#ifdef CONFIG_PREEMPT_RT_BASE
7951 +  extern void hrtimer_wait_for_timer(const struct hrtimer *timer);
7952 +#else
7953 +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0)
7954 +#endif
7955 +
7956  /* Query timers: */
7957  extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
7958  
7959 @@ -436,7 +453,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
7960   * Helper function to check, whether the timer is running the callback
7961   * function
7962   */
7963 -static inline int hrtimer_callback_running(struct hrtimer *timer)
7964 +static inline int hrtimer_callback_running(const struct hrtimer *timer)
7965  {
7966         return timer->base->cpu_base->running == timer;
7967  }
7968 diff --git a/include/linux/idr.h b/include/linux/idr.h
7969 index 083d61e92706..5899796f50cb 100644
7970 --- a/include/linux/idr.h
7971 +++ b/include/linux/idr.h
7972 @@ -95,10 +95,14 @@ bool idr_is_empty(struct idr *idp);
7973   * Each idr_preload() should be matched with an invocation of this
7974   * function.  See idr_preload() for details.
7975   */
7976 +#ifdef CONFIG_PREEMPT_RT_FULL
7977 +void idr_preload_end(void);
7978 +#else
7979  static inline void idr_preload_end(void)
7980  {
7981         preempt_enable();
7982  }
7983 +#endif
7984  
7985  /**
7986   * idr_find - return pointer for given id
7987 diff --git a/include/linux/init_task.h b/include/linux/init_task.h
7988 index f8834f820ec2..a688d5e19578 100644
7989 --- a/include/linux/init_task.h
7990 +++ b/include/linux/init_task.h
7991 @@ -148,6 +148,12 @@ extern struct task_group root_task_group;
7992  # define INIT_PERF_EVENTS(tsk)
7993  #endif
7994  
7995 +#ifdef CONFIG_PREEMPT_RT_BASE
7996 +# define INIT_TIMER_LIST               .posix_timer_list = NULL,
7997 +#else
7998 +# define INIT_TIMER_LIST
7999 +#endif
8000 +
8001  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
8002  # define INIT_VTIME(tsk)                                               \
8003         .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),      \
8004 @@ -239,6 +245,7 @@ extern struct task_group root_task_group;
8005         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),              \
8006         .pi_lock        = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),        \
8007         .timer_slack_ns = 50000, /* 50 usec default slack */            \
8008 +       INIT_TIMER_LIST                                                 \
8009         .pids = {                                                       \
8010                 [PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),            \
8011                 [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),           \
8012 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
8013 index b6683f0ffc9f..c0a351daf736 100644
8014 --- a/include/linux/interrupt.h
8015 +++ b/include/linux/interrupt.h
8016 @@ -14,6 +14,7 @@
8017  #include <linux/hrtimer.h>
8018  #include <linux/kref.h>
8019  #include <linux/workqueue.h>
8020 +#include <linux/swork.h>
8021  
8022  #include <linux/atomic.h>
8023  #include <asm/ptrace.h>
8024 @@ -61,6 +62,7 @@
8025   *                interrupt handler after suspending interrupts. For system
8026   *                wakeup devices users need to implement wakeup detection in
8027   *                their interrupt handlers.
8028 + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT)
8029   */
8030  #define IRQF_SHARED            0x00000080
8031  #define IRQF_PROBE_SHARED      0x00000100
8032 @@ -74,6 +76,7 @@
8033  #define IRQF_NO_THREAD         0x00010000
8034  #define IRQF_EARLY_RESUME      0x00020000
8035  #define IRQF_COND_SUSPEND      0x00040000
8036 +#define IRQF_NO_SOFTIRQ_CALL   0x00080000
8037  
8038  #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
8039  
8040 @@ -196,7 +199,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
8041  #ifdef CONFIG_LOCKDEP
8042  # define local_irq_enable_in_hardirq() do { } while (0)
8043  #else
8044 -# define local_irq_enable_in_hardirq() local_irq_enable()
8045 +# define local_irq_enable_in_hardirq() local_irq_enable_nort()
8046  #endif
8047  
8048  extern void disable_irq_nosync(unsigned int irq);
8049 @@ -216,6 +219,7 @@ extern void resume_device_irqs(void);
8050   * struct irq_affinity_notify - context for notification of IRQ affinity changes
8051   * @irq:               Interrupt to which notification applies
8052   * @kref:              Reference count, for internal use
8053 + * @swork:             Swork item, for internal use
8054   * @work:              Work item, for internal use
8055   * @notify:            Function to be called on change.  This will be
8056   *                     called in process context.
8057 @@ -227,7 +231,11 @@ extern void resume_device_irqs(void);
8058  struct irq_affinity_notify {
8059         unsigned int irq;
8060         struct kref kref;
8061 +#ifdef CONFIG_PREEMPT_RT_BASE
8062 +       struct swork_event swork;
8063 +#else
8064         struct work_struct work;
8065 +#endif
8066         void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
8067         void (*release)(struct kref *ref);
8068  };
8069 @@ -398,9 +406,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
8070                                  bool state);
8071  
8072  #ifdef CONFIG_IRQ_FORCED_THREADING
8073 +# ifndef CONFIG_PREEMPT_RT_BASE
8074  extern bool force_irqthreads;
8075 +# else
8076 +#  define force_irqthreads     (true)
8077 +# endif
8078  #else
8079 -#define force_irqthreads       (0)
8080 +#define force_irqthreads       (false)
8081  #endif
8082  
8083  #ifndef __ARCH_SET_SOFTIRQ_PENDING
8084 @@ -457,9 +469,10 @@ struct softirq_action
8085         void    (*action)(struct softirq_action *);
8086  };
8087  
8088 +#ifndef CONFIG_PREEMPT_RT_FULL
8089  asmlinkage void do_softirq(void);
8090  asmlinkage void __do_softirq(void);
8091 -
8092 +static inline void thread_do_softirq(void) { do_softirq(); }
8093  #ifdef __ARCH_HAS_DO_SOFTIRQ
8094  void do_softirq_own_stack(void);
8095  #else
8096 @@ -468,13 +481,25 @@ static inline void do_softirq_own_stack(void)
8097         __do_softirq();
8098  }
8099  #endif
8100 +#else
8101 +extern void thread_do_softirq(void);
8102 +#endif
8103  
8104  extern void open_softirq(int nr, void (*action)(struct softirq_action *));
8105  extern void softirq_init(void);
8106  extern void __raise_softirq_irqoff(unsigned int nr);
8107 +#ifdef CONFIG_PREEMPT_RT_FULL
8108 +extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
8109 +#else
8110 +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
8111 +{
8112 +       __raise_softirq_irqoff(nr);
8113 +}
8114 +#endif
8115  
8116  extern void raise_softirq_irqoff(unsigned int nr);
8117  extern void raise_softirq(unsigned int nr);
8118 +extern void softirq_check_pending_idle(void);
8119  
8120  DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
8121  
8122 @@ -496,8 +521,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
8123       to be executed on some cpu at least once after this.
8124     * If the tasklet is already scheduled, but its execution is still not
8125       started, it will be executed only once.
8126 -   * If this tasklet is already running on another CPU (or schedule is called
8127 -     from tasklet itself), it is rescheduled for later.
8128 +   * If this tasklet is already running on another CPU, it is rescheduled
8129 +     for later.
8130 +   * Schedule must not be called from the tasklet itself (a lockup occurs)
8131     * Tasklet is strictly serialized wrt itself, but not
8132       wrt another tasklets. If client needs some intertask synchronization,
8133       he makes it with spinlocks.
8134 @@ -522,27 +548,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data }
8135  enum
8136  {
8137         TASKLET_STATE_SCHED,    /* Tasklet is scheduled for execution */
8138 -       TASKLET_STATE_RUN       /* Tasklet is running (SMP only) */
8139 +       TASKLET_STATE_RUN,      /* Tasklet is running (SMP only) */
8140 +       TASKLET_STATE_PENDING   /* Tasklet is pending */
8141  };
8142  
8143 -#ifdef CONFIG_SMP
8144 +#define TASKLET_STATEF_SCHED   (1 << TASKLET_STATE_SCHED)
8145 +#define TASKLET_STATEF_RUN     (1 << TASKLET_STATE_RUN)
8146 +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING)
8147 +
8148 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
8149  static inline int tasklet_trylock(struct tasklet_struct *t)
8150  {
8151         return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
8152  }
8153  
8154 +static inline int tasklet_tryunlock(struct tasklet_struct *t)
8155 +{
8156 +       return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN;
8157 +}
8158 +
8159  static inline void tasklet_unlock(struct tasklet_struct *t)
8160  {
8161         smp_mb__before_atomic();
8162         clear_bit(TASKLET_STATE_RUN, &(t)->state);
8163  }
8164  
8165 -static inline void tasklet_unlock_wait(struct tasklet_struct *t)
8166 -{
8167 -       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); }
8168 -}
8169 +extern void tasklet_unlock_wait(struct tasklet_struct *t);
8170 +
8171  #else
8172  #define tasklet_trylock(t) 1
8173 +#define tasklet_tryunlock(t)   1
8174  #define tasklet_unlock_wait(t) do { } while (0)
8175  #define tasklet_unlock(t) do { } while (0)
8176  #endif
8177 @@ -591,12 +626,7 @@ static inline void tasklet_disable(struct tasklet_struct *t)
8178         smp_mb();
8179  }
8180  
8181 -static inline void tasklet_enable(struct tasklet_struct *t)
8182 -{
8183 -       smp_mb__before_atomic();
8184 -       atomic_dec(&t->count);
8185 -}
8186 -
8187 +extern void tasklet_enable(struct tasklet_struct *t);
8188  extern void tasklet_kill(struct tasklet_struct *t);
8189  extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
8190  extern void tasklet_init(struct tasklet_struct *t,
8191 @@ -627,6 +657,12 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer)
8192         tasklet_kill(&ttimer->tasklet);
8193  }
8194  
8195 +#ifdef CONFIG_PREEMPT_RT_FULL
8196 +extern void softirq_early_init(void);
8197 +#else
8198 +static inline void softirq_early_init(void) { }
8199 +#endif
8200 +
8201  /*
8202   * Autoprobing for irqs:
8203   *
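Illustrative note (not part of the patch): the tasklet interface stays unchanged for drivers; only the execution moves into the softirq thread on RT. A minimal caller-side sketch with hypothetical names:

#include <linux/interrupt.h>

/* Sketch only: tasklet_disable() returns after a concurrently running
 * handler has finished (it uses tasklet_unlock_wait() declared above),
 * so the hardware can be reconfigured safely in between. */
static void example_tasklet_teardown(struct tasklet_struct *t)
{
	tasklet_disable(t);		/* handler cannot run past this point */
	/* ... quiesce the device ... */
	tasklet_enable(t);

	tasklet_kill(t);		/* final removal: waits for SCHED/RUN to clear */
}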
8204 diff --git a/include/linux/irq.h b/include/linux/irq.h
8205 index 0ac26c892fe2..ede85f106aef 100644
8206 --- a/include/linux/irq.h
8207 +++ b/include/linux/irq.h
8208 @@ -72,6 +72,7 @@ enum irqchip_irq_state;
8209   * IRQ_IS_POLLED               - Always polled by another interrupt. Exclude
8210   *                               it from the spurious interrupt detection
8211   *                               mechanism and from core side polling.
8212 + * IRQ_NO_SOFTIRQ_CALL         - No softirq processing in the irq thread context (RT)
8213   * IRQ_DISABLE_UNLAZY          - Disable lazy irq disable
8214   */
8215  enum {
8216 @@ -99,13 +100,14 @@ enum {
8217         IRQ_PER_CPU_DEVID       = (1 << 17),
8218         IRQ_IS_POLLED           = (1 << 18),
8219         IRQ_DISABLE_UNLAZY      = (1 << 19),
8220 +       IRQ_NO_SOFTIRQ_CALL     = (1 << 20),
8221  };
8222  
8223  #define IRQF_MODIFY_MASK       \
8224         (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
8225          IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \
8226          IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \
8227 -        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY)
8228 +        IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL)
8229  
8230  #define IRQ_NO_BALANCING_MASK  (IRQ_PER_CPU | IRQ_NO_BALANCING)
8231  
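Illustrative note (not part of the patch): a driver whose interrupt must not pick up pending softirq work in its irq thread on RT could mark the line with the new flag through the existing helper. Hypothetical sketch:

#include <linux/irq.h>

static void example_mark_no_softirq(unsigned int irq)
{
	/* keep softirq processing out of this irq's thread context on RT */
	irq_set_status_flags(irq, IRQ_NO_SOFTIRQ_CALL);
}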
8232 diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
8233 index 47b9ebd4a74f..2543aab05daa 100644
8234 --- a/include/linux/irq_work.h
8235 +++ b/include/linux/irq_work.h
8236 @@ -16,6 +16,7 @@
8237  #define IRQ_WORK_BUSY          2UL
8238  #define IRQ_WORK_FLAGS         3UL
8239  #define IRQ_WORK_LAZY          4UL /* Doesn't want IPI, wait for tick */
8240 +#define IRQ_WORK_HARD_IRQ      8UL /* Run hard IRQ context, even on RT */
8241  
8242  struct irq_work {
8243         unsigned long flags;
8244 @@ -51,4 +52,10 @@ static inline bool irq_work_needs_cpu(void) { return false; }
8245  static inline void irq_work_run(void) { }
8246  #endif
8247  
8248 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
8249 +void irq_work_tick_soft(void);
8250 +#else
8251 +static inline void irq_work_tick_soft(void) { }
8252 +#endif
8253 +
8254  #endif /* _LINUX_IRQ_WORK_H */
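Illustrative note (not part of the patch): work that must still execute from hard interrupt context on RT sets IRQ_WORK_HARD_IRQ before being queued. Hypothetical sketch:

#include <linux/irq_work.h>

static void example_hard_irq_work(struct irq_work *work)
{
	/* runs in hard IRQ context even with CONFIG_PREEMPT_RT_FULL */
}

static struct irq_work example_work = {
	.flags = IRQ_WORK_HARD_IRQ,
	.func  = example_hard_irq_work,
};

/* queued from any context with irq_work_queue(&example_work); */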
8255 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
8256 index b51beebf9804..219d9824f762 100644
8257 --- a/include/linux/irqdesc.h
8258 +++ b/include/linux/irqdesc.h
8259 @@ -64,6 +64,7 @@ struct irq_desc {
8260         unsigned int            irqs_unhandled;
8261         atomic_t                threads_handled;
8262         int                     threads_handled_last;
8263 +       u64                     random_ip;
8264         raw_spinlock_t          lock;
8265         struct cpumask          *percpu_enabled;
8266         const struct cpumask    *percpu_affinity;
8267 diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
8268 index 5dd1272d1ab2..9b77034f7c5e 100644
8269 --- a/include/linux/irqflags.h
8270 +++ b/include/linux/irqflags.h
8271 @@ -25,8 +25,6 @@
8272  # define trace_softirqs_enabled(p)     ((p)->softirqs_enabled)
8273  # define trace_hardirq_enter() do { current->hardirq_context++; } while (0)
8274  # define trace_hardirq_exit()  do { current->hardirq_context--; } while (0)
8275 -# define lockdep_softirq_enter()       do { current->softirq_context++; } while (0)
8276 -# define lockdep_softirq_exit()        do { current->softirq_context--; } while (0)
8277  # define INIT_TRACE_IRQFLAGS   .softirqs_enabled = 1,
8278  #else
8279  # define trace_hardirqs_on()           do { } while (0)
8280 @@ -39,9 +37,15 @@
8281  # define trace_softirqs_enabled(p)     0
8282  # define trace_hardirq_enter()         do { } while (0)
8283  # define trace_hardirq_exit()          do { } while (0)
8284 +# define INIT_TRACE_IRQFLAGS
8285 +#endif
8286 +
8287 +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL)
8288 +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0)
8289 +# define lockdep_softirq_exit()         do { current->softirq_context--; } while (0)
8290 +#else
8291  # define lockdep_softirq_enter()       do { } while (0)
8292  # define lockdep_softirq_exit()                do { } while (0)
8293 -# define INIT_TRACE_IRQFLAGS
8294  #endif
8295  
8296  #if defined(CONFIG_IRQSOFF_TRACER) || \
8297 @@ -148,4 +152,23 @@
8298  
8299  #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags)
8300  
8301 +/*
8302 + * local_irq* variants depending on RT/!RT
8303 + */
8304 +#ifdef CONFIG_PREEMPT_RT_FULL
8305 +# define local_irq_disable_nort()      do { } while (0)
8306 +# define local_irq_enable_nort()       do { } while (0)
8307 +# define local_irq_save_nort(flags)    local_save_flags(flags)
8308 +# define local_irq_restore_nort(flags) (void)(flags)
8309 +# define local_irq_disable_rt()                local_irq_disable()
8310 +# define local_irq_enable_rt()         local_irq_enable()
8311 +#else
8312 +# define local_irq_disable_nort()      local_irq_disable()
8313 +# define local_irq_enable_nort()       local_irq_enable()
8314 +# define local_irq_save_nort(flags)    local_irq_save(flags)
8315 +# define local_irq_restore_nort(flags) local_irq_restore(flags)
8316 +# define local_irq_disable_rt()                do { } while (0)
8317 +# define local_irq_enable_rt()         do { } while (0)
8318 +#endif
8319 +
8320  #endif
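Illustrative note (not part of the patch): the _nort variants let code keep hard-IRQ protection on !RT while relying on the substituted sleeping locks on RT. A minimal sketch:

#include <linux/irqflags.h>

static void example_touch_state(void)
{
	unsigned long flags;

	local_irq_save_nort(flags);	/* real local_irq_save() on !RT, flags-only on RT */
	/* ... state that !RT protects by disabling interrupts ... */
	local_irq_restore_nort(flags);
}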
8321 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
8322 index dfaa1f4dcb0c..d57dd06544a1 100644
8323 --- a/include/linux/jbd2.h
8324 +++ b/include/linux/jbd2.h
8325 @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
8326  
8327  static inline void jbd_lock_bh_state(struct buffer_head *bh)
8328  {
8329 +#ifndef CONFIG_PREEMPT_RT_BASE
8330         bit_spin_lock(BH_State, &bh->b_state);
8331 +#else
8332 +       spin_lock(&bh->b_state_lock);
8333 +#endif
8334  }
8335  
8336  static inline int jbd_trylock_bh_state(struct buffer_head *bh)
8337  {
8338 +#ifndef CONFIG_PREEMPT_RT_BASE
8339         return bit_spin_trylock(BH_State, &bh->b_state);
8340 +#else
8341 +       return spin_trylock(&bh->b_state_lock);
8342 +#endif
8343  }
8344  
8345  static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
8346  {
8347 +#ifndef CONFIG_PREEMPT_RT_BASE
8348         return bit_spin_is_locked(BH_State, &bh->b_state);
8349 +#else
8350 +       return spin_is_locked(&bh->b_state_lock);
8351 +#endif
8352  }
8353  
8354  static inline void jbd_unlock_bh_state(struct buffer_head *bh)
8355  {
8356 +#ifndef CONFIG_PREEMPT_RT_BASE
8357         bit_spin_unlock(BH_State, &bh->b_state);
8358 +#else
8359 +       spin_unlock(&bh->b_state_lock);
8360 +#endif
8361  }
8362  
8363  static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
8364  {
8365 +#ifndef CONFIG_PREEMPT_RT_BASE
8366         bit_spin_lock(BH_JournalHead, &bh->b_state);
8367 +#else
8368 +       spin_lock(&bh->b_journal_head_lock);
8369 +#endif
8370  }
8371  
8372  static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
8373  {
8374 +#ifndef CONFIG_PREEMPT_RT_BASE
8375         bit_spin_unlock(BH_JournalHead, &bh->b_state);
8376 +#else
8377 +       spin_unlock(&bh->b_journal_head_lock);
8378 +#endif
8379  }
8380  
8381  #define J_ASSERT(assert)       BUG_ON(!(assert))
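Illustrative note (not part of the patch): jbd2 callers keep the same lock/unlock pairs; only the backing primitive changes on RT. Sketch:

#include <linux/jbd2.h>

static void example_touch_bh_state(struct buffer_head *bh)
{
	jbd_lock_bh_state(bh);		/* bit spinlock on !RT, b_state_lock on RT */
	/* ... inspect or modify journal-related buffer state ... */
	jbd_unlock_bh_state(bh);
}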
8382 diff --git a/include/linux/kdb.h b/include/linux/kdb.h
8383 index 410decacff8f..0861bebfc188 100644
8384 --- a/include/linux/kdb.h
8385 +++ b/include/linux/kdb.h
8386 @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
8387  extern __printf(1, 2) int kdb_printf(const char *, ...);
8388  typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...);
8389  
8390 +#define in_kdb_printk()        (kdb_trap_printk)
8391  extern void kdb_init(int level);
8392  
8393  /* Access to kdb specific polling devices */
8394 @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *,
8395  extern int kdb_unregister(char *);
8396  #else /* ! CONFIG_KGDB_KDB */
8397  static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; }
8398 +#define in_kdb_printk() (0)
8399  static inline void kdb_init(int level) {}
8400  static inline int kdb_register(char *cmd, kdb_func_t func, char *usage,
8401                                char *help, short minlen) { return 0; }
8402 diff --git a/include/linux/kernel.h b/include/linux/kernel.h
8403 index d96a6118d26a..37de2ce2d290 100644
8404 --- a/include/linux/kernel.h
8405 +++ b/include/linux/kernel.h
8406 @@ -194,6 +194,9 @@ extern int _cond_resched(void);
8407   */
8408  # define might_sleep() \
8409         do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
8410 +
8411 +# define might_sleep_no_state_check() \
8412 +       do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
8413  # define sched_annotate_sleep()        (current->task_state_change = 0)
8414  #else
8415    static inline void ___might_sleep(const char *file, int line,
8416 @@ -201,6 +204,7 @@ extern int _cond_resched(void);
8417    static inline void __might_sleep(const char *file, int line,
8418                                    int preempt_offset) { }
8419  # define might_sleep() do { might_resched(); } while (0)
8420 +# define might_sleep_no_state_check() do { might_resched(); } while (0)
8421  # define sched_annotate_sleep() do { } while (0)
8422  #endif
8423  
8424 @@ -491,6 +495,7 @@ extern enum system_states {
8425         SYSTEM_HALT,
8426         SYSTEM_POWER_OFF,
8427         SYSTEM_RESTART,
8428 +       SYSTEM_SUSPEND,
8429  } system_state;
8430  
8431  #define TAINT_PROPRIETARY_MODULE       0
8432 diff --git a/include/linux/lglock.h b/include/linux/lglock.h
8433 index c92ebd100d9b..6f035f635d0e 100644
8434 --- a/include/linux/lglock.h
8435 +++ b/include/linux/lglock.h
8436 @@ -34,13 +34,30 @@
8437  #endif
8438  
8439  struct lglock {
8440 +#ifdef CONFIG_PREEMPT_RT_FULL
8441 +       struct rt_mutex __percpu *lock;
8442 +#else
8443         arch_spinlock_t __percpu *lock;
8444 +#endif
8445  #ifdef CONFIG_DEBUG_LOCK_ALLOC
8446         struct lock_class_key lock_key;
8447         struct lockdep_map    lock_dep_map;
8448  #endif
8449  };
8450  
8451 +#ifdef CONFIG_PREEMPT_RT_FULL
8452 +# define DEFINE_LGLOCK(name)                                           \
8453 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
8454 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
8455 +       struct lglock name = { .lock = &name ## _lock }
8456 +
8457 +# define DEFINE_STATIC_LGLOCK(name)                                    \
8458 +       static DEFINE_PER_CPU(struct rt_mutex, name ## _lock)           \
8459 +       = __RT_MUTEX_INITIALIZER( name ## _lock);                       \
8460 +       static struct lglock name = { .lock = &name ## _lock }
8461 +
8462 +#else
8463 +
8464  #define DEFINE_LGLOCK(name)                                            \
8465         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
8466         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
8467 @@ -50,6 +67,7 @@ struct lglock {
8468         static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
8469         = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
8470         static struct lglock name = { .lock = &name ## _lock }
8471 +#endif
8472  
8473  void lg_lock_init(struct lglock *lg, char *name);
8474  
8475 @@ -64,6 +82,12 @@ void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
8476  void lg_global_lock(struct lglock *lg);
8477  void lg_global_unlock(struct lglock *lg);
8478  
8479 +#ifndef CONFIG_PREEMPT_RT_FULL
8480 +#define lg_global_trylock_relax(name)  lg_global_lock(name)
8481 +#else
8482 +void lg_global_trylock_relax(struct lglock *lg);
8483 +#endif
8484 +
8485  #else
8486  /* When !CONFIG_SMP, map lglock to spinlock */
8487  #define lglock spinlock
8488 diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
8489 index cb483305e1f5..4e5062316bb6 100644
8490 --- a/include/linux/list_bl.h
8491 +++ b/include/linux/list_bl.h
8492 @@ -2,6 +2,7 @@
8493  #define _LINUX_LIST_BL_H
8494  
8495  #include <linux/list.h>
8496 +#include <linux/spinlock.h>
8497  #include <linux/bit_spinlock.h>
8498  
8499  /*
8500 @@ -32,13 +33,24 @@
8501  
8502  struct hlist_bl_head {
8503         struct hlist_bl_node *first;
8504 +#ifdef CONFIG_PREEMPT_RT_BASE
8505 +       raw_spinlock_t lock;
8506 +#endif
8507  };
8508  
8509  struct hlist_bl_node {
8510         struct hlist_bl_node *next, **pprev;
8511  };
8512 -#define INIT_HLIST_BL_HEAD(ptr) \
8513 -       ((ptr)->first = NULL)
8514 +
8515 +#ifdef CONFIG_PREEMPT_RT_BASE
8516 +#define INIT_HLIST_BL_HEAD(h)          \
8517 +do {                                   \
8518 +       (h)->first = NULL;              \
8519 +       raw_spin_lock_init(&(h)->lock); \
8520 +} while (0)
8521 +#else
8522 +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
8523 +#endif
8524  
8525  static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
8526  {
8527 @@ -118,12 +130,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
8528  
8529  static inline void hlist_bl_lock(struct hlist_bl_head *b)
8530  {
8531 +#ifndef CONFIG_PREEMPT_RT_BASE
8532         bit_spin_lock(0, (unsigned long *)b);
8533 +#else
8534 +       raw_spin_lock(&b->lock);
8535 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
8536 +       __set_bit(0, (unsigned long *)b);
8537 +#endif
8538 +#endif
8539  }
8540  
8541  static inline void hlist_bl_unlock(struct hlist_bl_head *b)
8542  {
8543 +#ifndef CONFIG_PREEMPT_RT_BASE
8544         __bit_spin_unlock(0, (unsigned long *)b);
8545 +#else
8546 +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
8547 +       __clear_bit(0, (unsigned long *)b);
8548 +#endif
8549 +       raw_spin_unlock(&b->lock);
8550 +#endif
8551  }
8552  
8553  static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
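Illustrative note (not part of the patch): users of the bit-locked list are unchanged; on RT the pair below maps onto the raw_spinlock_t added to struct hlist_bl_head. Sketch:

#include <linux/list_bl.h>

static void example_add_node(struct hlist_bl_head *head,
			     struct hlist_bl_node *node)
{
	hlist_bl_lock(head);
	hlist_bl_add_head(node, head);
	hlist_bl_unlock(head);
}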
8554 diff --git a/include/linux/locallock.h b/include/linux/locallock.h
8555 new file mode 100644
8556 index 000000000000..845c77f1a5ca
8557 --- /dev/null
8558 +++ b/include/linux/locallock.h
8559 @@ -0,0 +1,278 @@
8560 +#ifndef _LINUX_LOCALLOCK_H
8561 +#define _LINUX_LOCALLOCK_H
8562 +
8563 +#include <linux/percpu.h>
8564 +#include <linux/spinlock.h>
8565 +
8566 +#ifdef CONFIG_PREEMPT_RT_BASE
8567 +
8568 +#ifdef CONFIG_DEBUG_SPINLOCK
8569 +# define LL_WARN(cond) WARN_ON(cond)
8570 +#else
8571 +# define LL_WARN(cond) do { } while (0)
8572 +#endif
8573 +
8574 +/*
8575 + * per cpu lock based substitute for local_irq_*()
8576 + */
8577 +struct local_irq_lock {
8578 +       spinlock_t              lock;
8579 +       struct task_struct      *owner;
8580 +       int                     nestcnt;
8581 +       unsigned long           flags;
8582 +};
8583 +
8584 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)                                    \
8585 +       DEFINE_PER_CPU(struct local_irq_lock, lvar) = {                 \
8586 +               .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) }
8587 +
8588 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)                                   \
8589 +       DECLARE_PER_CPU(struct local_irq_lock, lvar)
8590 +
8591 +#define local_irq_lock_init(lvar)                                      \
8592 +       do {                                                            \
8593 +               int __cpu;                                              \
8594 +               for_each_possible_cpu(__cpu)                            \
8595 +                       spin_lock_init(&per_cpu(lvar, __cpu).lock);     \
8596 +       } while (0)
8597 +
8598 +/*
8599 + * spin_lock|trylock|unlock_local flavour that does not migrate disable
8600 + * used for __local_lock|trylock|unlock where get_local_var/put_local_var
8601 + * already takes care of the migrate_disable/enable
8602 + * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
8603 + */
8604 +#ifdef CONFIG_PREEMPT_RT_FULL
8605 +# define spin_lock_local(lock)                 rt_spin_lock__no_mg(lock)
8606 +# define spin_trylock_local(lock)              rt_spin_trylock__no_mg(lock)
8607 +# define spin_unlock_local(lock)               rt_spin_unlock__no_mg(lock)
8608 +#else
8609 +# define spin_lock_local(lock)                 spin_lock(lock)
8610 +# define spin_trylock_local(lock)              spin_trylock(lock)
8611 +# define spin_unlock_local(lock)               spin_unlock(lock)
8612 +#endif
8613 +
8614 +static inline void __local_lock(struct local_irq_lock *lv)
8615 +{
8616 +       if (lv->owner != current) {
8617 +               spin_lock_local(&lv->lock);
8618 +               LL_WARN(lv->owner);
8619 +               LL_WARN(lv->nestcnt);
8620 +               lv->owner = current;
8621 +       }
8622 +       lv->nestcnt++;
8623 +}
8624 +
8625 +#define local_lock(lvar)                                       \
8626 +       do { __local_lock(&get_local_var(lvar)); } while (0)
8627 +
8628 +#define local_lock_on(lvar, cpu)                               \
8629 +       do { __local_lock(&per_cpu(lvar, cpu)); } while (0)
8630 +
8631 +static inline int __local_trylock(struct local_irq_lock *lv)
8632 +{
8633 +       if (lv->owner != current && spin_trylock_local(&lv->lock)) {
8634 +               LL_WARN(lv->owner);
8635 +               LL_WARN(lv->nestcnt);
8636 +               lv->owner = current;
8637 +               lv->nestcnt = 1;
8638 +               return 1;
8639 +       }
8640 +       return 0;
8641 +}
8642 +
8643 +#define local_trylock(lvar)                                            \
8644 +       ({                                                              \
8645 +               int __locked;                                           \
8646 +               __locked = __local_trylock(&get_local_var(lvar));       \
8647 +               if (!__locked)                                          \
8648 +                       put_local_var(lvar);                            \
8649 +               __locked;                                               \
8650 +       })
8651 +
8652 +static inline void __local_unlock(struct local_irq_lock *lv)
8653 +{
8654 +       LL_WARN(lv->nestcnt == 0);
8655 +       LL_WARN(lv->owner != current);
8656 +       if (--lv->nestcnt)
8657 +               return;
8658 +
8659 +       lv->owner = NULL;
8660 +       spin_unlock_local(&lv->lock);
8661 +}
8662 +
8663 +#define local_unlock(lvar)                                     \
8664 +       do {                                                    \
8665 +               __local_unlock(this_cpu_ptr(&lvar));            \
8666 +               put_local_var(lvar);                            \
8667 +       } while (0)
8668 +
8669 +#define local_unlock_on(lvar, cpu)                       \
8670 +       do { __local_unlock(&per_cpu(lvar, cpu)); } while (0)
8671 +
8672 +static inline void __local_lock_irq(struct local_irq_lock *lv)
8673 +{
8674 +       spin_lock_irqsave(&lv->lock, lv->flags);
8675 +       LL_WARN(lv->owner);
8676 +       LL_WARN(lv->nestcnt);
8677 +       lv->owner = current;
8678 +       lv->nestcnt = 1;
8679 +}
8680 +
8681 +#define local_lock_irq(lvar)                                           \
8682 +       do { __local_lock_irq(&get_local_var(lvar)); } while (0)
8683 +
8684 +#define local_lock_irq_on(lvar, cpu)                                   \
8685 +       do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0)
8686 +
8687 +static inline void __local_unlock_irq(struct local_irq_lock *lv)
8688 +{
8689 +       LL_WARN(!lv->nestcnt);
8690 +       LL_WARN(lv->owner != current);
8691 +       lv->owner = NULL;
8692 +       lv->nestcnt = 0;
8693 +       spin_unlock_irq(&lv->lock);
8694 +}
8695 +
8696 +#define local_unlock_irq(lvar)                                         \
8697 +       do {                                                            \
8698 +               __local_unlock_irq(this_cpu_ptr(&lvar));                \
8699 +               put_local_var(lvar);                                    \
8700 +       } while (0)
8701 +
8702 +#define local_unlock_irq_on(lvar, cpu)                                 \
8703 +       do {                                                            \
8704 +               __local_unlock_irq(&per_cpu(lvar, cpu));                \
8705 +       } while (0)
8706 +
8707 +static inline int __local_lock_irqsave(struct local_irq_lock *lv)
8708 +{
8709 +       if (lv->owner != current) {
8710 +               __local_lock_irq(lv);
8711 +               return 0;
8712 +       } else {
8713 +               lv->nestcnt++;
8714 +               return 1;
8715 +       }
8716 +}
8717 +
8718 +#define local_lock_irqsave(lvar, _flags)                               \
8719 +       do {                                                            \
8720 +               if (__local_lock_irqsave(&get_local_var(lvar)))         \
8721 +                       put_local_var(lvar);                            \
8722 +               _flags = __this_cpu_read(lvar.flags);                   \
8723 +       } while (0)
8724 +
8725 +#define local_lock_irqsave_on(lvar, _flags, cpu)                       \
8726 +       do {                                                            \
8727 +               __local_lock_irqsave(&per_cpu(lvar, cpu));              \
8728 +               _flags = per_cpu(lvar, cpu).flags;                      \
8729 +       } while (0)
8730 +
8731 +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv,
8732 +                                           unsigned long flags)
8733 +{
8734 +       LL_WARN(!lv->nestcnt);
8735 +       LL_WARN(lv->owner != current);
8736 +       if (--lv->nestcnt)
8737 +               return 0;
8738 +
8739 +       lv->owner = NULL;
8740 +       spin_unlock_irqrestore(&lv->lock, lv->flags);
8741 +       return 1;
8742 +}
8743 +
8744 +#define local_unlock_irqrestore(lvar, flags)                           \
8745 +       do {                                                            \
8746 +               if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \
8747 +                       put_local_var(lvar);                            \
8748 +       } while (0)
8749 +
8750 +#define local_unlock_irqrestore_on(lvar, flags, cpu)                   \
8751 +       do {                                                            \
8752 +               __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags);  \
8753 +       } while (0)
8754 +
8755 +#define local_spin_trylock_irq(lvar, lock)                             \
8756 +       ({                                                              \
8757 +               int __locked;                                           \
8758 +               local_lock_irq(lvar);                                   \
8759 +               __locked = spin_trylock(lock);                          \
8760 +               if (!__locked)                                          \
8761 +                       local_unlock_irq(lvar);                         \
8762 +               __locked;                                               \
8763 +       })
8764 +
8765 +#define local_spin_lock_irq(lvar, lock)                                        \
8766 +       do {                                                            \
8767 +               local_lock_irq(lvar);                                   \
8768 +               spin_lock(lock);                                        \
8769 +       } while (0)
8770 +
8771 +#define local_spin_unlock_irq(lvar, lock)                              \
8772 +       do {                                                            \
8773 +               spin_unlock(lock);                                      \
8774 +               local_unlock_irq(lvar);                                 \
8775 +       } while (0)
8776 +
8777 +#define local_spin_lock_irqsave(lvar, lock, flags)                     \
8778 +       do {                                                            \
8779 +               local_lock_irqsave(lvar, flags);                        \
8780 +               spin_lock(lock);                                        \
8781 +       } while (0)
8782 +
8783 +#define local_spin_unlock_irqrestore(lvar, lock, flags)                        \
8784 +       do {                                                            \
8785 +               spin_unlock(lock);                                      \
8786 +               local_unlock_irqrestore(lvar, flags);                   \
8787 +       } while (0)
8788 +
8789 +#define get_locked_var(lvar, var)                                      \
8790 +       (*({                                                            \
8791 +               local_lock(lvar);                                       \
8792 +               this_cpu_ptr(&var);                                     \
8793 +       }))
8794 +
8795 +#define put_locked_var(lvar, var)      local_unlock(lvar);
8796 +
8797 +#define local_lock_cpu(lvar)                                           \
8798 +       ({                                                              \
8799 +               local_lock(lvar);                                       \
8800 +               smp_processor_id();                                     \
8801 +       })
8802 +
8803 +#define local_unlock_cpu(lvar)                 local_unlock(lvar)
8804 +
8805 +#else /* PREEMPT_RT_BASE */
8806 +
8807 +#define DEFINE_LOCAL_IRQ_LOCK(lvar)            __typeof__(const int) lvar
8808 +#define DECLARE_LOCAL_IRQ_LOCK(lvar)           extern __typeof__(const int) lvar
8809 +
8810 +static inline void local_irq_lock_init(int lvar) { }
8811 +
8812 +#define local_lock(lvar)                       preempt_disable()
8813 +#define local_unlock(lvar)                     preempt_enable()
8814 +#define local_lock_irq(lvar)                   local_irq_disable()
8815 +#define local_lock_irq_on(lvar, cpu)           local_irq_disable()
8816 +#define local_unlock_irq(lvar)                 local_irq_enable()
8817 +#define local_unlock_irq_on(lvar, cpu)         local_irq_enable()
8818 +#define local_lock_irqsave(lvar, flags)                local_irq_save(flags)
8819 +#define local_unlock_irqrestore(lvar, flags)   local_irq_restore(flags)
8820 +
8821 +#define local_spin_trylock_irq(lvar, lock)     spin_trylock_irq(lock)
8822 +#define local_spin_lock_irq(lvar, lock)                spin_lock_irq(lock)
8823 +#define local_spin_unlock_irq(lvar, lock)      spin_unlock_irq(lock)
8824 +#define local_spin_lock_irqsave(lvar, lock, flags)     \
8825 +       spin_lock_irqsave(lock, flags)
8826 +#define local_spin_unlock_irqrestore(lvar, lock, flags)        \
8827 +       spin_unlock_irqrestore(lock, flags)
8828 +
8829 +#define get_locked_var(lvar, var)              get_cpu_var(var)
8830 +#define put_locked_var(lvar, var)              put_cpu_var(var)
8831 +
8832 +#define local_lock_cpu(lvar)                   get_cpu()
8833 +#define local_unlock_cpu(lvar)                 put_cpu()
8834 +
8835 +#endif
8836 +
8837 +#endif
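Illustrative note (not part of the patch): code that used local_irq_save() purely to protect per-CPU data can switch to a local lock; on !RT the macros above compile back to local_irq_save()/restore(). A minimal sketch with hypothetical names:

#include <linux/locallock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_events);
DEFINE_LOCAL_IRQ_LOCK(example_lock);

static void example_account_event(void)
{
	unsigned long flags;

	local_lock_irqsave(example_lock, flags);
	__this_cpu_inc(example_events);	/* IRQs off on !RT, per-CPU lock held on RT */
	local_unlock_irqrestore(example_lock, flags);
}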
8838 diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
8839 index 903200f4ec41..df670d441fc9 100644
8840 --- a/include/linux/mm_types.h
8841 +++ b/include/linux/mm_types.h
8842 @@ -11,6 +11,7 @@
8843  #include <linux/completion.h>
8844  #include <linux/cpumask.h>
8845  #include <linux/uprobes.h>
8846 +#include <linux/rcupdate.h>
8847  #include <linux/page-flags-layout.h>
8848  #include <linux/workqueue.h>
8849  #include <asm/page.h>
8850 @@ -508,6 +509,9 @@ struct mm_struct {
8851         bool tlb_flush_pending;
8852  #endif
8853         struct uprobes_state uprobes_state;
8854 +#ifdef CONFIG_PREEMPT_RT_BASE
8855 +       struct rcu_head delayed_drop;
8856 +#endif
8857  #ifdef CONFIG_X86_INTEL_MPX
8858         /* address of the bounds directory */
8859         void __user *bd_addr;
8860 diff --git a/include/linux/mutex.h b/include/linux/mutex.h
8861 index 2cb7531e7d7a..b3fdfc820216 100644
8862 --- a/include/linux/mutex.h
8863 +++ b/include/linux/mutex.h
8864 @@ -19,6 +19,17 @@
8865  #include <asm/processor.h>
8866  #include <linux/osq_lock.h>
8867  
8868 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8869 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
8870 +       , .dep_map = { .name = #lockname }
8871 +#else
8872 +# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
8873 +#endif
8874 +
8875 +#ifdef CONFIG_PREEMPT_RT_FULL
8876 +# include <linux/mutex_rt.h>
8877 +#else
8878 +
8879  /*
8880   * Simple, straightforward mutexes with strict semantics:
8881   *
8882 @@ -99,13 +110,6 @@ do {                                                        \
8883  static inline void mutex_destroy(struct mutex *lock) {}
8884  #endif
8885  
8886 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
8887 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
8888 -               , .dep_map = { .name = #lockname }
8889 -#else
8890 -# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
8891 -#endif
8892 -
8893  #define __MUTEX_INITIALIZER(lockname) \
8894                 { .count = ATOMIC_INIT(1) \
8895                 , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
8896 @@ -173,6 +177,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
8897  extern int mutex_trylock(struct mutex *lock);
8898  extern void mutex_unlock(struct mutex *lock);
8899  
8900 +#endif /* !PREEMPT_RT_FULL */
8901 +
8902  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
8903  
8904  #endif /* __LINUX_MUTEX_H */
8905 diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h
8906 new file mode 100644
8907 index 000000000000..c38a44b14da5
8908 --- /dev/null
8909 +++ b/include/linux/mutex_rt.h
8910 @@ -0,0 +1,84 @@
8911 +#ifndef __LINUX_MUTEX_RT_H
8912 +#define __LINUX_MUTEX_RT_H
8913 +
8914 +#ifndef __LINUX_MUTEX_H
8915 +#error "Please include mutex.h"
8916 +#endif
8917 +
8918 +#include <linux/rtmutex.h>
8919 +
8920 +/* FIXME: Just for __lockfunc */
8921 +#include <linux/spinlock.h>
8922 +
8923 +struct mutex {
8924 +       struct rt_mutex         lock;
8925 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8926 +       struct lockdep_map      dep_map;
8927 +#endif
8928 +};
8929 +
8930 +#define __MUTEX_INITIALIZER(mutexname)                                 \
8931 +       {                                                               \
8932 +               .lock = __RT_MUTEX_INITIALIZER(mutexname.lock)          \
8933 +               __DEP_MAP_MUTEX_INITIALIZER(mutexname)                  \
8934 +       }
8935 +
8936 +#define DEFINE_MUTEX(mutexname)                                                \
8937 +       struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
8938 +
8939 +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key);
8940 +extern void __lockfunc _mutex_lock(struct mutex *lock);
8941 +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock);
8942 +extern int __lockfunc _mutex_lock_killable(struct mutex *lock);
8943 +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass);
8944 +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
8945 +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass);
8946 +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass);
8947 +extern int __lockfunc _mutex_trylock(struct mutex *lock);
8948 +extern void __lockfunc _mutex_unlock(struct mutex *lock);
8949 +
8950 +#define mutex_is_locked(l)             rt_mutex_is_locked(&(l)->lock)
8951 +#define mutex_lock(l)                  _mutex_lock(l)
8952 +#define mutex_lock_interruptible(l)    _mutex_lock_interruptible(l)
8953 +#define mutex_lock_killable(l)         _mutex_lock_killable(l)
8954 +#define mutex_trylock(l)               _mutex_trylock(l)
8955 +#define mutex_unlock(l)                        _mutex_unlock(l)
8956 +#define mutex_destroy(l)               rt_mutex_destroy(&(l)->lock)
8957 +
8958 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
8959 +# define mutex_lock_nested(l, s)       _mutex_lock_nested(l, s)
8960 +# define mutex_lock_interruptible_nested(l, s) \
8961 +                                       _mutex_lock_interruptible_nested(l, s)
8962 +# define mutex_lock_killable_nested(l, s) \
8963 +                                       _mutex_lock_killable_nested(l, s)
8964 +
8965 +# define mutex_lock_nest_lock(lock, nest_lock)                         \
8966 +do {                                                                   \
8967 +       typecheck(struct lockdep_map *, &(nest_lock)->dep_map);         \
8968 +       _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);             \
8969 +} while (0)
8970 +
8971 +#else
8972 +# define mutex_lock_nested(l, s)       _mutex_lock(l)
8973 +# define mutex_lock_interruptible_nested(l, s) \
8974 +                                       _mutex_lock_interruptible(l)
8975 +# define mutex_lock_killable_nested(l, s) \
8976 +                                       _mutex_lock_killable(l)
8977 +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
8978 +#endif
8979 +
8980 +# define mutex_init(mutex)                             \
8981 +do {                                                   \
8982 +       static struct lock_class_key __key;             \
8983 +                                                       \
8984 +       rt_mutex_init(&(mutex)->lock);                  \
8985 +       __mutex_do_init((mutex), #mutex, &__key);       \
8986 +} while (0)
8987 +
8988 +# define __mutex_init(mutex, name, key)                        \
8989 +do {                                                   \
8990 +       rt_mutex_init(&(mutex)->lock);                  \
8991 +       __mutex_do_init((mutex), name, key);            \
8992 +} while (0)
8993 +
8994 +#endif
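Illustrative note (not part of the patch): mutex users need no changes; with PREEMPT_RT_FULL the same calls resolve to the rtmutex-backed wrappers above. Sketch:

#include <linux/mutex.h>

static DEFINE_MUTEX(example_mutex);
static int example_counter;

static void example_update(void)
{
	mutex_lock(&example_mutex);	/* PI-aware rt_mutex underneath on RT */
	example_counter++;
	mutex_unlock(&example_mutex);
}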
8995 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
8996 index e8d79d4ebcfe..2ae8fa187016 100644
8997 --- a/include/linux/netdevice.h
8998 +++ b/include/linux/netdevice.h
8999 @@ -2409,14 +2409,53 @@ void netdev_freemem(struct net_device *dev);
9000  void synchronize_net(void);
9001  int init_dummy_netdev(struct net_device *dev);
9002  
9003 -DECLARE_PER_CPU(int, xmit_recursion);
9004  #define XMIT_RECURSION_LIMIT   10
9005 +#ifdef CONFIG_PREEMPT_RT_FULL
9006 +static inline int dev_recursion_level(void)
9007 +{
9008 +       return current->xmit_recursion;
9009 +}
9010 +
9011 +static inline int xmit_rec_read(void)
9012 +{
9013 +       return current->xmit_recursion;
9014 +}
9015 +
9016 +static inline void xmit_rec_inc(void)
9017 +{
9018 +       current->xmit_recursion++;
9019 +}
9020 +
9021 +static inline void xmit_rec_dec(void)
9022 +{
9023 +       current->xmit_recursion--;
9024 +}
9025 +
9026 +#else
9027 +
9028 +DECLARE_PER_CPU(int, xmit_recursion);
9029  
9030  static inline int dev_recursion_level(void)
9031  {
9032         return this_cpu_read(xmit_recursion);
9033  }
9034  
9035 +static inline int xmit_rec_read(void)
9036 +{
9037 +       return __this_cpu_read(xmit_recursion);
9038 +}
9039 +
9040 +static inline void xmit_rec_inc(void)
9041 +{
9042 +       __this_cpu_inc(xmit_recursion);
9043 +}
9044 +
9045 +static inline void xmit_rec_dec(void)
9046 +{
9047 +       __this_cpu_dec(xmit_recursion);
9048 +}
9049 +#endif
9050 +
9051  struct net_device *dev_get_by_index(struct net *net, int ifindex);
9052  struct net_device *__dev_get_by_index(struct net *net, int ifindex);
9053  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
9054 @@ -2794,6 +2833,7 @@ struct softnet_data {
9055         unsigned int            dropped;
9056         struct sk_buff_head     input_pkt_queue;
9057         struct napi_struct      backlog;
9058 +       struct sk_buff_head     tofree_queue;
9059  
9060  };
9061  
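Illustrative note (not part of the patch): a hypothetical transmit wrapper using the xmit_rec_*() helpers, which keep the recursion count in task_struct on RT (where the sender may migrate or sleep) and per-CPU otherwise:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int example_xmit(struct sk_buff *skb)
{
	int ret;

	if (xmit_rec_read() > XMIT_RECURSION_LIMIT) {
		kfree_skb(skb);			/* refuse to recurse any deeper */
		return NET_XMIT_DROP;
	}

	xmit_rec_inc();
	ret = dev_queue_xmit(skb);
	xmit_rec_dec();
	return ret;
}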
9062 diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
9063 index 2ad1a2b289b5..b4d10155af54 100644
9064 --- a/include/linux/netfilter/x_tables.h
9065 +++ b/include/linux/netfilter/x_tables.h
9066 @@ -4,6 +4,7 @@
9067  
9068  #include <linux/netdevice.h>
9069  #include <linux/static_key.h>
9070 +#include <linux/locallock.h>
9071  #include <uapi/linux/netfilter/x_tables.h>
9072  
9073  /* Test a struct->invflags and a boolean for inequality */
9074 @@ -300,6 +301,8 @@ void xt_free_table_info(struct xt_table_info *info);
9075   */
9076  DECLARE_PER_CPU(seqcount_t, xt_recseq);
9077  
9078 +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock);
9079 +
9080  /* xt_tee_enabled - true if x_tables needs to handle reentrancy
9081   *
9082   * Enabled if current ip(6)tables ruleset has at least one -j TEE rule.
9083 @@ -320,6 +323,9 @@ static inline unsigned int xt_write_recseq_begin(void)
9084  {
9085         unsigned int addend;
9086  
9087 +       /* RT protection */
9088 +       local_lock(xt_write_lock);
9089 +
9090         /*
9091          * Low order bit of sequence is set if we already
9092          * called xt_write_recseq_begin().
9093 @@ -350,6 +356,7 @@ static inline void xt_write_recseq_end(unsigned int addend)
9094         /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
9095         smp_wmb();
9096         __this_cpu_add(xt_recseq.sequence, addend);
9097 +       local_unlock(xt_write_lock);
9098  }
9099  
9100  /*
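Illustrative note (not part of the patch): the usual pattern around a ruleset walk; with the hunk above, xt_write_recseq_begin() additionally takes the xt_write_lock local lock on RT. Sketch:

#include <linux/netfilter/x_tables.h>

static void example_table_walk(void)
{
	unsigned int addend;

	local_bh_disable();
	addend = xt_write_recseq_begin();	/* also local_lock(xt_write_lock) on RT */
	/* ... traverse the table and update counters ... */
	xt_write_recseq_end(addend);		/* drops xt_write_lock on RT */
	local_bh_enable();
}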
9101 diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
9102 index 810124b33327..d54ca43d571f 100644
9103 --- a/include/linux/nfs_fs.h
9104 +++ b/include/linux/nfs_fs.h
9105 @@ -165,7 +165,11 @@ struct nfs_inode {
9106  
9107         /* Readers: in-flight sillydelete RPC calls */
9108         /* Writers: rmdir */
9109 +#ifdef CONFIG_PREEMPT_RT_BASE
9110 +       struct semaphore        rmdir_sem;
9111 +#else
9112         struct rw_semaphore     rmdir_sem;
9113 +#endif
9114  
9115  #if IS_ENABLED(CONFIG_NFS_V4)
9116         struct nfs4_cached_acl  *nfs4_acl;
9117 diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
9118 index 7cc0deee5bde..a20f49ee69ee 100644
9119 --- a/include/linux/nfs_xdr.h
9120 +++ b/include/linux/nfs_xdr.h
9121 @@ -1484,7 +1484,7 @@ struct nfs_unlinkdata {
9122         struct nfs_removeargs args;
9123         struct nfs_removeres res;
9124         struct dentry *dentry;
9125 -       wait_queue_head_t wq;
9126 +       struct swait_queue_head wq;
9127         struct rpc_cred *cred;
9128         struct nfs_fattr dir_attr;
9129         long timeout;
9130 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
9131 index 4149868de4e6..babe5b9bcb91 100644
9132 --- a/include/linux/notifier.h
9133 +++ b/include/linux/notifier.h
9134 @@ -6,7 +6,7 @@
9135   *
9136   *                             Alan Cox <Alan.Cox@linux.org>
9137   */
9138 - 
9139 +
9140  #ifndef _LINUX_NOTIFIER_H
9141  #define _LINUX_NOTIFIER_H
9142  #include <linux/errno.h>
9143 @@ -42,9 +42,7 @@
9144   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
9145   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
9146   * SRCU notifier chains should be used when the chain will be called very
9147 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
9148 - * chains are slightly more difficult to use because they require special
9149 - * runtime initialization.
9150 + * often but notifier_blocks will seldom be removed.
9151   */
9152  
9153  struct notifier_block;
9154 @@ -90,7 +88,7 @@ struct srcu_notifier_head {
9155                 (name)->head = NULL;            \
9156         } while (0)
9157  
9158 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
9159 +/* srcu_notifier_heads must be cleaned up dynamically */
9160  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
9161  #define srcu_cleanup_notifier_head(name)       \
9162                 cleanup_srcu_struct(&(name)->srcu);
9163 @@ -103,7 +101,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
9164                 .head = NULL }
9165  #define RAW_NOTIFIER_INIT(name)        {                               \
9166                 .head = NULL }
9167 -/* srcu_notifier_heads cannot be initialized statically */
9168 +
9169 +#define SRCU_NOTIFIER_INIT(name, pcpu)                         \
9170 +       {                                                       \
9171 +               .mutex = __MUTEX_INITIALIZER(name.mutex),       \
9172 +               .head = NULL,                                   \
9173 +               .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),    \
9174 +       }
9175  
9176  #define ATOMIC_NOTIFIER_HEAD(name)                             \
9177         struct atomic_notifier_head name =                      \
9178 @@ -115,6 +119,18 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
9179         struct raw_notifier_head name =                         \
9180                 RAW_NOTIFIER_INIT(name)
9181  
9182 +#define _SRCU_NOTIFIER_HEAD(name, mod)                         \
9183 +       static DEFINE_PER_CPU(struct srcu_struct_array,         \
9184 +                       name##_head_srcu_array);                \
9185 +       mod struct srcu_notifier_head name =                    \
9186 +                       SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
9187 +
9188 +#define SRCU_NOTIFIER_HEAD(name)                               \
9189 +       _SRCU_NOTIFIER_HEAD(name, )
9190 +
9191 +#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
9192 +       _SRCU_NOTIFIER_HEAD(name, static)
9193 +
9194  #ifdef __KERNEL__
9195  
9196  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
9197 @@ -184,12 +200,12 @@ static inline int notifier_to_errno(int ret)
9198  
9199  /*
9200   *     Declared notifiers so far. I can imagine quite a few more chains
9201 - *     over time (eg laptop power reset chains, reboot chain (to clean 
9202 + *     over time (eg laptop power reset chains, reboot chain (to clean
9203   *     device units up), device [un]mount chain, module load/unload chain,
9204 - *     low memory chain, screenblank chain (for plug in modular screenblankers) 
9205 + *     low memory chain, screenblank chain (for plug in modular screenblankers)
9206   *     VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
9207   */
9208 - 
9209 +
9210  /* CPU notfiers are defined in include/linux/cpu.h. */
9211  
9212  /* netdevice notifiers are defined in include/linux/netdevice.h */
9213 diff --git a/include/linux/percpu.h b/include/linux/percpu.h
9214 index 56939d3f6e53..1c7e33fc83e4 100644
9215 --- a/include/linux/percpu.h
9216 +++ b/include/linux/percpu.h
9217 @@ -18,6 +18,35 @@
9218  #define PERCPU_MODULE_RESERVE          0
9219  #endif
9220  
9221 +#ifdef CONFIG_PREEMPT_RT_FULL
9222 +
9223 +#define get_local_var(var) (*({        \
9224 +       migrate_disable();      \
9225 +       this_cpu_ptr(&var);     }))
9226 +
9227 +#define put_local_var(var) do {        \
9228 +       (void)&(var);           \
9229 +       migrate_enable();       \
9230 +} while (0)
9231 +
9232 +# define get_local_ptr(var) ({ \
9233 +       migrate_disable();      \
9234 +       this_cpu_ptr(var);      })
9235 +
9236 +# define put_local_ptr(var) do {       \
9237 +       (void)(var);                    \
9238 +       migrate_enable();               \
9239 +} while (0)
9240 +
9241 +#else
9242 +
9243 +#define get_local_var(var)     get_cpu_var(var)
9244 +#define put_local_var(var)     put_cpu_var(var)
9245 +#define get_local_ptr(var)     get_cpu_ptr(var)
9246 +#define put_local_ptr(var)     put_cpu_ptr(var)
9247 +
9248 +#endif
9249 +
9250  /* minimum unit size, also is the maximum supported allocation size */
9251  #define PCPU_MIN_UNIT_SIZE             PFN_ALIGN(32 << 10)
9252  
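Illustrative note (not part of the patch): get_local_var() pins the task to its CPU with migrate_disable() on RT instead of disabling preemption, so sleeping locks may be taken while the reference is held. Sketch with hypothetical names:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_counter);

static void example_count(void)
{
	int *cnt = &get_local_var(example_counter);

	(*cnt)++;			/* CPU cannot change underneath us */
	put_local_var(example_counter);
}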
9253 diff --git a/include/linux/pid.h b/include/linux/pid.h
9254 index 23705a53abba..2cc64b779f03 100644
9255 --- a/include/linux/pid.h
9256 +++ b/include/linux/pid.h
9257 @@ -2,6 +2,7 @@
9258  #define _LINUX_PID_H
9259  
9260  #include <linux/rcupdate.h>
9261 +#include <linux/atomic.h>
9262  
9263  enum pid_type
9264  {
9265 diff --git a/include/linux/preempt.h b/include/linux/preempt.h
9266 index 75e4e30677f1..1cfb1cb72354 100644
9267 --- a/include/linux/preempt.h
9268 +++ b/include/linux/preempt.h
9269 @@ -50,7 +50,11 @@
9270  #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
9271  #define NMI_OFFSET     (1UL << NMI_SHIFT)
9272  
9273 -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
9274 +#ifndef CONFIG_PREEMPT_RT_FULL
9275 +# define SOFTIRQ_DISABLE_OFFSET                (2 * SOFTIRQ_OFFSET)
9276 +#else
9277 +# define SOFTIRQ_DISABLE_OFFSET                (0)
9278 +#endif
9279  
9280  /* We use the MSB mostly because its available */
9281  #define PREEMPT_NEED_RESCHED   0x80000000
9282 @@ -59,9 +63,15 @@
9283  #include <asm/preempt.h>
9284  
9285  #define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
9286 -#define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
9287  #define irq_count()    (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
9288                                  | NMI_MASK))
9289 +#ifndef CONFIG_PREEMPT_RT_FULL
9290 +# define softirq_count()       (preempt_count() & SOFTIRQ_MASK)
9291 +# define in_serving_softirq()  (softirq_count() & SOFTIRQ_OFFSET)
9292 +#else
9293 +# define softirq_count()       (0UL)
9294 +extern int in_serving_softirq(void);
9295 +#endif
9296  
9297  /*
9298   * Are we doing bottom half or hardware interrupt processing?
9299 @@ -72,7 +82,6 @@
9300  #define in_irq()               (hardirq_count())
9301  #define in_softirq()           (softirq_count())
9302  #define in_interrupt()         (irq_count())
9303 -#define in_serving_softirq()   (softirq_count() & SOFTIRQ_OFFSET)
9304  
9305  /*
9306   * Are we in NMI context?
9307 @@ -91,7 +100,11 @@
9308  /*
9309   * The preempt_count offset after spin_lock()
9310   */
9311 +#if !defined(CONFIG_PREEMPT_RT_FULL)
9312  #define PREEMPT_LOCK_OFFSET    PREEMPT_DISABLE_OFFSET
9313 +#else
9314 +#define PREEMPT_LOCK_OFFSET    0
9315 +#endif
9316  
9317  /*
9318   * The preempt_count offset needed for things like:
9319 @@ -140,6 +153,20 @@ extern void preempt_count_sub(int val);
9320  #define preempt_count_inc() preempt_count_add(1)
9321  #define preempt_count_dec() preempt_count_sub(1)
9322  
9323 +#ifdef CONFIG_PREEMPT_LAZY
9324 +#define add_preempt_lazy_count(val)    do { preempt_lazy_count() += (val); } while (0)
9325 +#define sub_preempt_lazy_count(val)    do { preempt_lazy_count() -= (val); } while (0)
9326 +#define inc_preempt_lazy_count()       add_preempt_lazy_count(1)
9327 +#define dec_preempt_lazy_count()       sub_preempt_lazy_count(1)
9328 +#define preempt_lazy_count()           (current_thread_info()->preempt_lazy_count)
9329 +#else
9330 +#define add_preempt_lazy_count(val)    do { } while (0)
9331 +#define sub_preempt_lazy_count(val)    do { } while (0)
9332 +#define inc_preempt_lazy_count()       do { } while (0)
9333 +#define dec_preempt_lazy_count()       do { } while (0)
9334 +#define preempt_lazy_count()           (0)
9335 +#endif
9336 +
9337  #ifdef CONFIG_PREEMPT_COUNT
9338  
9339  #define preempt_disable() \
9340 @@ -148,13 +175,25 @@ do { \
9341         barrier(); \
9342  } while (0)
9343  
9344 +#define preempt_lazy_disable() \
9345 +do { \
9346 +       inc_preempt_lazy_count(); \
9347 +       barrier(); \
9348 +} while (0)
9349 +
9350  #define sched_preempt_enable_no_resched() \
9351  do { \
9352         barrier(); \
9353         preempt_count_dec(); \
9354  } while (0)
9355  
9356 -#define preempt_enable_no_resched() sched_preempt_enable_no_resched()
9357 +#ifdef CONFIG_PREEMPT_RT_BASE
9358 +# define preempt_enable_no_resched() sched_preempt_enable_no_resched()
9359 +# define preempt_check_resched_rt() preempt_check_resched()
9360 +#else
9361 +# define preempt_enable_no_resched() preempt_enable()
9362 +# define preempt_check_resched_rt() barrier();
9363 +#endif
9364  
9365  #define preemptible()  (preempt_count() == 0 && !irqs_disabled())
9366  
9367 @@ -179,6 +218,13 @@ do { \
9368                 __preempt_schedule(); \
9369  } while (0)
9370  
9371 +#define preempt_lazy_enable() \
9372 +do { \
9373 +       dec_preempt_lazy_count(); \
9374 +       barrier(); \
9375 +       preempt_check_resched(); \
9376 +} while (0)
9377 +
9378  #else /* !CONFIG_PREEMPT */
9379  #define preempt_enable() \
9380  do { \
9381 @@ -224,6 +270,7 @@ do { \
9382  #define preempt_disable_notrace()              barrier()
9383  #define preempt_enable_no_resched_notrace()    barrier()
9384  #define preempt_enable_notrace()               barrier()
9385 +#define preempt_check_resched_rt()             barrier()
9386  #define preemptible()                          0
9387  
9388  #endif /* CONFIG_PREEMPT_COUNT */
9389 @@ -244,10 +291,31 @@ do { \
9390  } while (0)
9391  #define preempt_fold_need_resched() \
9392  do { \
9393 -       if (tif_need_resched()) \
9394 +       if (tif_need_resched_now()) \
9395                 set_preempt_need_resched(); \
9396  } while (0)
9397  
9398 +#ifdef CONFIG_PREEMPT_RT_FULL
9399 +# define preempt_disable_rt()          preempt_disable()
9400 +# define preempt_enable_rt()           preempt_enable()
9401 +# define preempt_disable_nort()                barrier()
9402 +# define preempt_enable_nort()         barrier()
9403 +# ifdef CONFIG_SMP
9404 +   extern void migrate_disable(void);
9405 +   extern void migrate_enable(void);
9406 +# else /* CONFIG_SMP */
9407 +#  define migrate_disable()            barrier()
9408 +#  define migrate_enable()             barrier()
9409 +# endif /* CONFIG_SMP */
9410 +#else
9411 +# define preempt_disable_rt()          barrier()
9412 +# define preempt_enable_rt()           barrier()
9413 +# define preempt_disable_nort()                preempt_disable()
9414 +# define preempt_enable_nort()         preempt_enable()
9415 +# define migrate_disable()             preempt_disable()
9416 +# define migrate_enable()              preempt_enable()
9417 +#endif
9418 +
9419  #ifdef CONFIG_PREEMPT_NOTIFIERS
9420  
9421  struct preempt_notifier;
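Illustrative note (not part of the patch): migrate_disable() keeps a task on its current CPU but, unlike preempt_disable(), leaves it preemptible on RT, so the section may take sleeping spinlocks. Sketch with hypothetical names:

#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/list.h>

static DEFINE_PER_CPU(struct list_head, example_queue);

static void example_walk_queue(void (*fn)(struct list_head *))
{
	struct list_head *head;

	migrate_disable();		/* stable smp_processor_id(), still preemptible on RT */
	head = this_cpu_ptr(&example_queue);
	fn(head);			/* may sleep under PREEMPT_RT_FULL */
	migrate_enable();
}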
9422 diff --git a/include/linux/printk.h b/include/linux/printk.h
9423 index 696a56be7d3e..310aa321ef0c 100644
9424 --- a/include/linux/printk.h
9425 +++ b/include/linux/printk.h
9426 @@ -125,9 +125,11 @@ struct va_format {
9427  #ifdef CONFIG_EARLY_PRINTK
9428  extern asmlinkage __printf(1, 2)
9429  void early_printk(const char *fmt, ...);
9430 +extern void printk_kill(void);
9431  #else
9432  static inline __printf(1, 2) __cold
9433  void early_printk(const char *s, ...) { }
9434 +static inline void printk_kill(void) { }
9435  #endif
9436  
9437  #ifdef CONFIG_PRINTK_NMI
9438 diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
9439 index 52b97db93830..fd9ea1c68db6 100644
9440 --- a/include/linux/radix-tree.h
9441 +++ b/include/linux/radix-tree.h
9442 @@ -289,9 +289,19 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
9443  unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
9444                         void ***results, unsigned long *indices,
9445                         unsigned long first_index, unsigned int max_items);
9446 +#ifdef CONFIG_PREEMPT_RT_FULL
9447 +static inline int radix_tree_preload(gfp_t gm) { return 0; }
9448 +static inline int radix_tree_maybe_preload(gfp_t gfp_mask) { return 0; }
9449 +static inline int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
9450 +{
9451 +       return 0;
9452 +};
9453 +
9454 +#else
9455  int radix_tree_preload(gfp_t gfp_mask);
9456  int radix_tree_maybe_preload(gfp_t gfp_mask);
9457  int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
9458 +#endif
9459  void radix_tree_init(void);
9460  void *radix_tree_tag_set(struct radix_tree_root *root,
9461                         unsigned long index, unsigned int tag);
9462 @@ -316,7 +326,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
9463  
9464  static inline void radix_tree_preload_end(void)
9465  {
9466 -       preempt_enable();
9467 +       preempt_enable_nort();
9468  }
9469  
9470  /**
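Illustrative note (not part of the patch): the standard preload pattern; on RT the preload above is a no-op and radix_tree_preload_end() reduces to preempt_enable_nort(). Sketch:

#include <linux/radix-tree.h>
#include <linux/gfp.h>

static int example_insert(struct radix_tree_root *root,
			  unsigned long index, void *item)
{
	int err;

	err = radix_tree_preload(GFP_KERNEL);	/* returns 0 immediately on RT */
	if (err)
		return err;
	/* the real caller would take its tree lock here */
	err = radix_tree_insert(root, index, item);
	radix_tree_preload_end();
	return err;
}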
9471 diff --git a/include/linux/random.h b/include/linux/random.h
9472 index 3d6e9815cd85..f6e8860b6494 100644
9473 --- a/include/linux/random.h
9474 +++ b/include/linux/random.h
9475 @@ -20,7 +20,7 @@ struct random_ready_callback {
9476  extern void add_device_randomness(const void *, unsigned int);
9477  extern void add_input_randomness(unsigned int type, unsigned int code,
9478                                  unsigned int value);
9479 -extern void add_interrupt_randomness(int irq, int irq_flags);
9480 +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip);
9481  
9482  extern void get_random_bytes(void *buf, int nbytes);
9483  extern int add_random_ready_callback(struct random_ready_callback *rdy);
9484 diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
9485 index e585018498d5..25c64474fc27 100644
9486 --- a/include/linux/rbtree.h
9487 +++ b/include/linux/rbtree.h
9488 @@ -31,7 +31,7 @@
9489  
9490  #include <linux/kernel.h>
9491  #include <linux/stddef.h>
9492 -#include <linux/rcupdate.h>
9493 +#include <linux/rcu_assign_pointer.h>
9494  
9495  struct rb_node {
9496         unsigned long  __rb_parent_color;
9497 diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
9498 index d076183e49be..36bfb4dd57ae 100644
9499 --- a/include/linux/rbtree_augmented.h
9500 +++ b/include/linux/rbtree_augmented.h
9501 @@ -26,6 +26,7 @@
9502  
9503  #include <linux/compiler.h>
9504  #include <linux/rbtree.h>
9505 +#include <linux/rcupdate.h>
9506  
9507  /*
9508   * Please note - only struct rb_augment_callbacks and the prototypes for
9509 diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h
9510 new file mode 100644
9511 index 000000000000..7066962a4379
9512 --- /dev/null
9513 +++ b/include/linux/rcu_assign_pointer.h
9514 @@ -0,0 +1,54 @@
9515 +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__
9516 +#define __LINUX_RCU_ASSIGN_POINTER_H__
9517 +#include <linux/compiler.h>
9518 +#include <asm/barrier.h>
9519 +
9520 +/**
9521 + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
9522 + * @v: The value to statically initialize with.
9523 + */
9524 +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
9525 +
9526 +/**
9527 + * rcu_assign_pointer() - assign to RCU-protected pointer
9528 + * @p: pointer to assign to
9529 + * @v: value to assign (publish)
9530 + *
9531 + * Assigns the specified value to the specified RCU-protected
9532 + * pointer, ensuring that any concurrent RCU readers will see
9533 + * any prior initialization.
9534 + *
9535 + * Inserts memory barriers on architectures that require them
9536 + * (which is most of them), and also prevents the compiler from
9537 + * reordering the code that initializes the structure after the pointer
9538 + * assignment.  More importantly, this call documents which pointers
9539 + * will be dereferenced by RCU read-side code.
9540 + *
9541 + * In some special cases, you may use RCU_INIT_POINTER() instead
9542 + * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
9543 + * to the fact that it does not constrain either the CPU or the compiler.
9544 + * That said, using RCU_INIT_POINTER() when you should have used
9545 + * rcu_assign_pointer() is a very bad thing that results in
9546 + * impossible-to-diagnose memory corruption.  So please be careful.
9547 + * See the RCU_INIT_POINTER() comment header for details.
9548 + *
9549 + * Note that rcu_assign_pointer() evaluates each of its arguments only
9550 + * once, appearances notwithstanding.  One of the "extra" evaluations
9551 + * is in typeof() and the other visible only to sparse (__CHECKER__),
9552 + * neither of which actually execute the argument.  As with most cpp
9553 + * macros, this execute-arguments-only-once property is important, so
9554 + * please be careful when making changes to rcu_assign_pointer() and the
9555 + * other macros that it invokes.
9556 + */
9557 +#define rcu_assign_pointer(p, v)                                             \
9558 +({                                                                           \
9559 +       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
9560 +                                                                             \
9561 +       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
9562 +               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
9563 +       else                                                                  \
9564 +               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
9565 +       _r_a_p__v;                                                            \
9566 +})
9567 +
9568 +#endif
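Illustrative note (not part of the patch): the split-out header keeps the usual publish pattern available to rbtree.h without pulling in all of rcupdate.h. Sketch with hypothetical names:

#include <linux/rcu_assign_pointer.h>
#include <linux/slab.h>

struct example_cfg {
	int value;
};

static struct example_cfg __rcu *example_cfg_ptr;

static int example_publish(int value)
{
	struct example_cfg *new = kmalloc(sizeof(*new), GFP_KERNEL);

	if (!new)
		return -ENOMEM;
	new->value = value;
	rcu_assign_pointer(example_cfg_ptr, new);  /* readers see a fully built object */
	/* the previous object, if any, would be freed after a grace period */
	return 0;
}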
9569 diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
9570 index 1aa62e1a761b..2a614acb433e 100644
9571 --- a/include/linux/rcupdate.h
9572 +++ b/include/linux/rcupdate.h
9573 @@ -46,6 +46,7 @@
9574  #include <linux/compiler.h>
9575  #include <linux/ktime.h>
9576  #include <linux/irqflags.h>
9577 +#include <linux/rcu_assign_pointer.h>
9578  
9579  #include <asm/barrier.h>
9580  
9581 @@ -178,6 +179,9 @@ void call_rcu(struct rcu_head *head,
9582  
9583  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
9584  
9585 +#ifdef CONFIG_PREEMPT_RT_FULL
9586 +#define call_rcu_bh    call_rcu
9587 +#else
9588  /**
9589   * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
9590   * @head: structure to be used for queueing the RCU updates.
9591 @@ -201,6 +205,7 @@ void call_rcu(struct rcu_head *head,
9592   */
9593  void call_rcu_bh(struct rcu_head *head,
9594                  rcu_callback_t func);
9595 +#endif
9596  
9597  /**
9598   * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
9599 @@ -301,6 +306,11 @@ void synchronize_rcu(void);
9600   * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
9601   */
9602  #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
9603 +#ifndef CONFIG_PREEMPT_RT_FULL
9604 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
9605 +#else
9606 +static inline int sched_rcu_preempt_depth(void) { return 0; }
9607 +#endif
9608  
9609  #else /* #ifdef CONFIG_PREEMPT_RCU */
9610  
9611 @@ -326,6 +336,8 @@ static inline int rcu_preempt_depth(void)
9612         return 0;
9613  }
9614  
9615 +#define sched_rcu_preempt_depth()      rcu_preempt_depth()
9616 +
9617  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
9618  
9619  /* Internal to kernel */
9620 @@ -500,7 +512,14 @@ extern struct lockdep_map rcu_callback_map;
9621  int debug_lockdep_rcu_enabled(void);
9622  
9623  int rcu_read_lock_held(void);
9624 +#ifdef CONFIG_PREEMPT_RT_FULL
9625 +static inline int rcu_read_lock_bh_held(void)
9626 +{
9627 +       return rcu_read_lock_held();
9628 +}
9629 +#else
9630  int rcu_read_lock_bh_held(void);
9631 +#endif
9632  
9633  /**
9634   * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
9635 @@ -621,54 +640,6 @@ static inline void rcu_preempt_sleep_check(void)
9636  })
9637  
9638  /**
9639 - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
9640 - * @v: The value to statically initialize with.
9641 - */
9642 -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
9643 -
9644 -/**
9645 - * rcu_assign_pointer() - assign to RCU-protected pointer
9646 - * @p: pointer to assign to
9647 - * @v: value to assign (publish)
9648 - *
9649 - * Assigns the specified value to the specified RCU-protected
9650 - * pointer, ensuring that any concurrent RCU readers will see
9651 - * any prior initialization.
9652 - *
9653 - * Inserts memory barriers on architectures that require them
9654 - * (which is most of them), and also prevents the compiler from
9655 - * reordering the code that initializes the structure after the pointer
9656 - * assignment.  More importantly, this call documents which pointers
9657 - * will be dereferenced by RCU read-side code.
9658 - *
9659 - * In some special cases, you may use RCU_INIT_POINTER() instead
9660 - * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
9661 - * to the fact that it does not constrain either the CPU or the compiler.
9662 - * That said, using RCU_INIT_POINTER() when you should have used
9663 - * rcu_assign_pointer() is a very bad thing that results in
9664 - * impossible-to-diagnose memory corruption.  So please be careful.
9665 - * See the RCU_INIT_POINTER() comment header for details.
9666 - *
9667 - * Note that rcu_assign_pointer() evaluates each of its arguments only
9668 - * once, appearances notwithstanding.  One of the "extra" evaluations
9669 - * is in typeof() and the other visible only to sparse (__CHECKER__),
9670 - * neither of which actually execute the argument.  As with most cpp
9671 - * macros, this execute-arguments-only-once property is important, so
9672 - * please be careful when making changes to rcu_assign_pointer() and the
9673 - * other macros that it invokes.
9674 - */
9675 -#define rcu_assign_pointer(p, v)                                             \
9676 -({                                                                           \
9677 -       uintptr_t _r_a_p__v = (uintptr_t)(v);                                 \
9678 -                                                                             \
9679 -       if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)        \
9680 -               WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
9681 -       else                                                                  \
9682 -               smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
9683 -       _r_a_p__v;                                                            \
9684 -})
9685 -
9686 -/**
9687   * rcu_access_pointer() - fetch RCU pointer with no dereferencing
9688   * @p: The pointer to read
9689   *
9690 @@ -946,10 +917,14 @@ static inline void rcu_read_unlock(void)
9691  static inline void rcu_read_lock_bh(void)
9692  {
9693         local_bh_disable();
9694 +#ifdef CONFIG_PREEMPT_RT_FULL
9695 +       rcu_read_lock();
9696 +#else
9697         __acquire(RCU_BH);
9698         rcu_lock_acquire(&rcu_bh_lock_map);
9699         RCU_LOCKDEP_WARN(!rcu_is_watching(),
9700                          "rcu_read_lock_bh() used illegally while idle");
9701 +#endif
9702  }
9703  
9704  /*
9705 @@ -959,10 +934,14 @@ static inline void rcu_read_lock_bh(void)
9706   */
9707  static inline void rcu_read_unlock_bh(void)
9708  {
9709 +#ifdef CONFIG_PREEMPT_RT_FULL
9710 +       rcu_read_unlock();
9711 +#else
9712         RCU_LOCKDEP_WARN(!rcu_is_watching(),
9713                          "rcu_read_unlock_bh() used illegally while idle");
9714         rcu_lock_release(&rcu_bh_lock_map);
9715         __release(RCU_BH);
9716 +#endif
9717         local_bh_enable();
9718  }
9719  
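
Under CONFIG_PREEMPT_RT_FULL the BH flavour of RCU is folded into plain RCU: call_rcu_bh() is defined to call_rcu(), rcu_read_lock_bh()/rcu_read_unlock_bh() nest a regular rcu_read_lock() section inside local_bh_disable(), and rcu_read_lock_bh_held() falls back to rcu_read_lock_held(). Existing BH-RCU users keep their calling convention on both configurations; a sketch of such a user (the item list and helpers are illustrative):

struct item {
        struct list_head node;
        struct rcu_head rcu;
        int key;
};

static LIST_HEAD(item_list);    /* updates serialized by a lock elsewhere */

/* BH-flavoured read side; on RT this is an ordinary RCU read section. */
static bool item_present(int key)
{
        struct item *it;
        bool found = false;

        rcu_read_lock_bh();
        list_for_each_entry_rcu(it, &item_list, node) {
                if (it->key == key) {
                        found = true;
                        break;
                }
        }
        rcu_read_unlock_bh();
        return found;
}

static void item_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct item, rcu));
}

/* Updater: on RT this queues onto the normal RCU grace period. */
static void item_remove(struct item *it)
{
        list_del_rcu(&it->node);
        call_rcu_bh(&it->rcu, item_free_rcu);
}
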
9720 diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
9721 index 63a4e4cf40a5..08ab12df2863 100644
9722 --- a/include/linux/rcutree.h
9723 +++ b/include/linux/rcutree.h
9724 @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu)
9725         rcu_note_context_switch();
9726  }
9727  
9728 +#ifdef CONFIG_PREEMPT_RT_FULL
9729 +# define synchronize_rcu_bh    synchronize_rcu
9730 +#else
9731  void synchronize_rcu_bh(void);
9732 +#endif
9733  void synchronize_sched_expedited(void);
9734  void synchronize_rcu_expedited(void);
9735  
9736 @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void)
9737  }
9738  
9739  void rcu_barrier(void);
9740 +#ifdef CONFIG_PREEMPT_RT_FULL
9741 +# define rcu_barrier_bh                rcu_barrier
9742 +#else
9743  void rcu_barrier_bh(void);
9744 +#endif
9745  void rcu_barrier_sched(void);
9746  unsigned long get_state_synchronize_rcu(void);
9747  void cond_synchronize_rcu(unsigned long oldstate);
9748 @@ -82,17 +90,14 @@ void cond_synchronize_sched(unsigned long oldstate);
9749  extern unsigned long rcutorture_testseq;
9750  extern unsigned long rcutorture_vernum;
9751  unsigned long rcu_batches_started(void);
9752 -unsigned long rcu_batches_started_bh(void);
9753  unsigned long rcu_batches_started_sched(void);
9754  unsigned long rcu_batches_completed(void);
9755 -unsigned long rcu_batches_completed_bh(void);
9756  unsigned long rcu_batches_completed_sched(void);
9757  unsigned long rcu_exp_batches_completed(void);
9758  unsigned long rcu_exp_batches_completed_sched(void);
9759  void show_rcu_gp_kthreads(void);
9760  
9761  void rcu_force_quiescent_state(void);
9762 -void rcu_bh_force_quiescent_state(void);
9763  void rcu_sched_force_quiescent_state(void);
9764  
9765  void rcu_idle_enter(void);
9766 @@ -109,6 +114,16 @@ extern int rcu_scheduler_active __read_mostly;
9767  
9768  bool rcu_is_watching(void);
9769  
9770 +#ifndef CONFIG_PREEMPT_RT_FULL
9771 +void rcu_bh_force_quiescent_state(void);
9772 +unsigned long rcu_batches_started_bh(void);
9773 +unsigned long rcu_batches_completed_bh(void);
9774 +#else
9775 +# define rcu_bh_force_quiescent_state  rcu_force_quiescent_state
9776 +# define rcu_batches_completed_bh      rcu_batches_completed
9777 +# define rcu_batches_started_bh                rcu_batches_completed
9778 +#endif
9779 +
9780  void rcu_all_qs(void);
9781  
9782  /* RCUtree hotplug events */
9783 diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
9784 index 1abba5ce2a2f..30211c627511 100644
9785 --- a/include/linux/rtmutex.h
9786 +++ b/include/linux/rtmutex.h
9787 @@ -13,11 +13,15 @@
9788  #define __LINUX_RT_MUTEX_H
9789  
9790  #include <linux/linkage.h>
9791 +#include <linux/spinlock_types_raw.h>
9792  #include <linux/rbtree.h>
9793 -#include <linux/spinlock_types.h>
9794  
9795  extern int max_lock_depth; /* for sysctl */
9796  
9797 +#ifdef CONFIG_DEBUG_MUTEXES
9798 +#include <linux/debug_locks.h>
9799 +#endif
9800 +
9801  /**
9802   * The rt_mutex structure
9803   *
9804 @@ -31,8 +35,8 @@ struct rt_mutex {
9805         struct rb_root          waiters;
9806         struct rb_node          *waiters_leftmost;
9807         struct task_struct      *owner;
9808 -#ifdef CONFIG_DEBUG_RT_MUTEXES
9809         int                     save_state;
9810 +#ifdef CONFIG_DEBUG_RT_MUTEXES
9811         const char              *name, *file;
9812         int                     line;
9813         void                    *magic;
9814 @@ -55,22 +59,33 @@ struct hrtimer_sleeper;
9815  # define rt_mutex_debug_check_no_locks_held(task)      do { } while (0)
9816  #endif
9817  
9818 +# define rt_mutex_init(mutex)                                  \
9819 +       do {                                                    \
9820 +               raw_spin_lock_init(&(mutex)->wait_lock);        \
9821 +               __rt_mutex_init(mutex, #mutex);                 \
9822 +       } while (0)
9823 +
9824  #ifdef CONFIG_DEBUG_RT_MUTEXES
9825  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
9826         , .name = #mutexname, .file = __FILE__, .line = __LINE__
9827 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, __func__)
9828   extern void rt_mutex_debug_task_free(struct task_struct *tsk);
9829  #else
9830  # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
9831 -# define rt_mutex_init(mutex)                  __rt_mutex_init(mutex, NULL)
9832  # define rt_mutex_debug_task_free(t)                   do { } while (0)
9833  #endif
9834  
9835 -#define __RT_MUTEX_INITIALIZER(mutexname) \
9836 -       { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
9837 +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \
9838 +        .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
9839         , .waiters = RB_ROOT \
9840         , .owner = NULL \
9841 -       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
9842 +       __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
9843 +
9844 +#define __RT_MUTEX_INITIALIZER(mutexname) \
9845 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) }
9846 +
9847 +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \
9848 +       { __RT_MUTEX_INITIALIZER_PLAIN(mutexname)    \
9849 +       , .save_state = 1 }
9850  
9851  #define DEFINE_RT_MUTEX(mutexname) \
9852         struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
9853 @@ -91,6 +106,7 @@ extern void rt_mutex_destroy(struct rt_mutex *lock);
9854  
9855  extern void rt_mutex_lock(struct rt_mutex *lock);
9856  extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
9857 +extern int rt_mutex_lock_killable(struct rt_mutex *lock);
9858  extern int rt_mutex_timed_lock(struct rt_mutex *lock,
9859                                struct hrtimer_sleeper *timeout);
9860  
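
The rtmutex.h changes make rt_mutex_init() unconditionally initialize the wait_lock, move save_state out from under CONFIG_DEBUG_RT_MUTEXES, add a _SAVE_STATE initializer for the sleeping-spinlock conversions, and declare rt_mutex_lock_killable(). The public calling convention is unchanged; a short sketch (conf_lock and the helpers are illustrative):

static DEFINE_RT_MUTEX(conf_lock);

static int update_config(void)
{
        /* New killable variant: returns nonzero if a fatal signal arrived. */
        if (rt_mutex_lock_killable(&conf_lock))
                return -EINTR;

        /* ... modify the configuration ... */

        rt_mutex_unlock(&conf_lock);
        return 0;
}

static void init_dynamic_lock(struct rt_mutex *lock)
{
        rt_mutex_init(lock);    /* now also initializes lock->wait_lock */
}
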
9861 diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h
9862 new file mode 100644
9863 index 000000000000..49ed2d45d3be
9864 --- /dev/null
9865 +++ b/include/linux/rwlock_rt.h
9866 @@ -0,0 +1,99 @@
9867 +#ifndef __LINUX_RWLOCK_RT_H
9868 +#define __LINUX_RWLOCK_RT_H
9869 +
9870 +#ifndef __LINUX_SPINLOCK_H
9871 +#error Do not include directly. Use spinlock.h
9872 +#endif
9873 +
9874 +#define rwlock_init(rwl)                               \
9875 +do {                                                   \
9876 +       static struct lock_class_key __key;             \
9877 +                                                       \
9878 +       rt_mutex_init(&(rwl)->lock);                    \
9879 +       __rt_rwlock_init(rwl, #rwl, &__key);            \
9880 +} while (0)
9881 +
9882 +extern void __lockfunc rt_write_lock(rwlock_t *rwlock);
9883 +extern void __lockfunc rt_read_lock(rwlock_t *rwlock);
9884 +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock);
9885 +extern int __lockfunc rt_write_trylock_irqsave(rwlock_t *trylock, unsigned long *flags);
9886 +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock);
9887 +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock);
9888 +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock);
9889 +extern unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock);
9890 +extern unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock);
9891 +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key);
9892 +
9893 +#define read_trylock(lock)     __cond_lock(lock, rt_read_trylock(lock))
9894 +#define write_trylock(lock)    __cond_lock(lock, rt_write_trylock(lock))
9895 +
9896 +#define write_trylock_irqsave(lock, flags)     \
9897 +       __cond_lock(lock, rt_write_trylock_irqsave(lock, &flags))
9898 +
9899 +#define read_lock_irqsave(lock, flags)                 \
9900 +       do {                                            \
9901 +               typecheck(unsigned long, flags);        \
9902 +               flags = rt_read_lock_irqsave(lock);     \
9903 +       } while (0)
9904 +
9905 +#define write_lock_irqsave(lock, flags)                        \
9906 +       do {                                            \
9907 +               typecheck(unsigned long, flags);        \
9908 +               flags = rt_write_lock_irqsave(lock);    \
9909 +       } while (0)
9910 +
9911 +#define read_lock(lock)                rt_read_lock(lock)
9912 +
9913 +#define read_lock_bh(lock)                             \
9914 +       do {                                            \
9915 +               local_bh_disable();                     \
9916 +               rt_read_lock(lock);                     \
9917 +       } while (0)
9918 +
9919 +#define read_lock_irq(lock)    read_lock(lock)
9920 +
9921 +#define write_lock(lock)       rt_write_lock(lock)
9922 +
9923 +#define write_lock_bh(lock)                            \
9924 +       do {                                            \
9925 +               local_bh_disable();                     \
9926 +               rt_write_lock(lock);                    \
9927 +       } while (0)
9928 +
9929 +#define write_lock_irq(lock)   write_lock(lock)
9930 +
9931 +#define read_unlock(lock)      rt_read_unlock(lock)
9932 +
9933 +#define read_unlock_bh(lock)                           \
9934 +       do {                                            \
9935 +               rt_read_unlock(lock);                   \
9936 +               local_bh_enable();                      \
9937 +       } while (0)
9938 +
9939 +#define read_unlock_irq(lock)  read_unlock(lock)
9940 +
9941 +#define write_unlock(lock)     rt_write_unlock(lock)
9942 +
9943 +#define write_unlock_bh(lock)                          \
9944 +       do {                                            \
9945 +               rt_write_unlock(lock);                  \
9946 +               local_bh_enable();                      \
9947 +       } while (0)
9948 +
9949 +#define write_unlock_irq(lock) write_unlock(lock)
9950 +
9951 +#define read_unlock_irqrestore(lock, flags)            \
9952 +       do {                                            \
9953 +               typecheck(unsigned long, flags);        \
9954 +               (void) flags;                           \
9955 +               rt_read_unlock(lock);                   \
9956 +       } while (0)
9957 +
9958 +#define write_unlock_irqrestore(lock, flags) \
9959 +       do {                                            \
9960 +               typecheck(unsigned long, flags);        \
9961 +               (void) flags;                           \
9962 +               rt_write_unlock(lock);                  \
9963 +       } while (0)
9964 +
9965 +#endif
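
On PREEMPT_RT_FULL this header replaces rwlock.h: rwlock_t becomes an rt_mutex-based lock (see rwlock_types_rt.h below) and every read_/write_ operation maps onto the rt_* functions declared here. Note that the _irqsave variants only typecheck and fill the flags value and the _irqrestore variants discard it, so interrupts are not actually disabled. Callers keep the stock API; a sketch (tbl_lock and tbl_value are illustrative):

static DEFINE_RWLOCK(tbl_lock);
static int tbl_value;

static int tbl_read(void)
{
        int v;

        read_lock(&tbl_lock);           /* rt_read_lock(): may sleep on RT */
        v = tbl_value;
        read_unlock(&tbl_lock);
        return v;
}

static void tbl_write(int v)
{
        unsigned long flags;

        /* flags is typechecked and filled by rt_write_lock_irqsave(), then ignored. */
        write_lock_irqsave(&tbl_lock, flags);
        tbl_value = v;
        write_unlock_irqrestore(&tbl_lock, flags);
}
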
9966 diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h
9967 index cc0072e93e36..5317cd957292 100644
9968 --- a/include/linux/rwlock_types.h
9969 +++ b/include/linux/rwlock_types.h
9970 @@ -1,6 +1,10 @@
9971  #ifndef __LINUX_RWLOCK_TYPES_H
9972  #define __LINUX_RWLOCK_TYPES_H
9973  
9974 +#if !defined(__LINUX_SPINLOCK_TYPES_H)
9975 +# error "Do not include directly, include spinlock_types.h"
9976 +#endif
9977 +
9978  /*
9979   * include/linux/rwlock_types.h - generic rwlock type definitions
9980   *                               and initializers
9981 diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h
9982 new file mode 100644
9983 index 000000000000..51b28d775fe1
9984 --- /dev/null
9985 +++ b/include/linux/rwlock_types_rt.h
9986 @@ -0,0 +1,33 @@
9987 +#ifndef __LINUX_RWLOCK_TYPES_RT_H
9988 +#define __LINUX_RWLOCK_TYPES_RT_H
9989 +
9990 +#ifndef __LINUX_SPINLOCK_TYPES_H
9991 +#error "Do not include directly. Include spinlock_types.h instead"
9992 +#endif
9993 +
9994 +/*
9995 + * rwlocks - rtmutex which allows single reader recursion
9996 + */
9997 +typedef struct {
9998 +       struct rt_mutex         lock;
9999 +       int                     read_depth;
10000 +       unsigned int            break_lock;
10001 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10002 +       struct lockdep_map      dep_map;
10003 +#endif
10004 +} rwlock_t;
10005 +
10006 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10007 +# define RW_DEP_MAP_INIT(lockname)     .dep_map = { .name = #lockname }
10008 +#else
10009 +# define RW_DEP_MAP_INIT(lockname)
10010 +#endif
10011 +
10012 +#define __RW_LOCK_UNLOCKED(name) \
10013 +       { .lock = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.lock), \
10014 +         RW_DEP_MAP_INIT(name) }
10015 +
10016 +#define DEFINE_RWLOCK(name) \
10017 +       rwlock_t name = __RW_LOCK_UNLOCKED(name)
10018 +
10019 +#endif
10020 diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
10021 index dd1d14250340..8e1f44ff1f2f 100644
10022 --- a/include/linux/rwsem.h
10023 +++ b/include/linux/rwsem.h
10024 @@ -19,6 +19,10 @@
10025  #include <linux/osq_lock.h>
10026  #endif
10027  
10028 +#ifdef CONFIG_PREEMPT_RT_FULL
10029 +#include <linux/rwsem_rt.h>
10030 +#else /* PREEMPT_RT_FULL */
10031 +
10032  struct rw_semaphore;
10033  
10034  #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
10035 @@ -184,4 +188,6 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
10036  # define up_read_non_owner(sem)                        up_read(sem)
10037  #endif
10038  
10039 +#endif /* !PREEMPT_RT_FULL */
10040 +
10041  #endif /* _LINUX_RWSEM_H */
10042 diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h
10043 new file mode 100644
10044 index 000000000000..e26bd95a57c3
10045 --- /dev/null
10046 +++ b/include/linux/rwsem_rt.h
10047 @@ -0,0 +1,167 @@
10048 +#ifndef _LINUX_RWSEM_RT_H
10049 +#define _LINUX_RWSEM_RT_H
10050 +
10051 +#ifndef _LINUX_RWSEM_H
10052 +#error "Include rwsem.h"
10053 +#endif
10054 +
10055 +/*
10056 + * RW-semaphores are a spinlock plus a reader-depth count.
10057 + *
10058 + * Note that the semantics are different from the usual
10059 + * Linux rw-sems, in PREEMPT_RT mode we do not allow
10060 + * multiple readers to hold the lock at once, we only allow
10061 + * a read-lock owner to read-lock recursively. This is
10062 + * better for latency, makes the implementation inherently
10063 + * fair and makes it simpler as well.
10064 + */
10065 +
10066 +#include <linux/rtmutex.h>
10067 +
10068 +struct rw_semaphore {
10069 +       struct rt_mutex         lock;
10070 +       int                     read_depth;
10071 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10072 +       struct lockdep_map      dep_map;
10073 +#endif
10074 +};
10075 +
10076 +#define __RWSEM_INITIALIZER(name) \
10077 +       { .lock = __RT_MUTEX_INITIALIZER(name.lock), \
10078 +         RW_DEP_MAP_INIT(name) }
10079 +
10080 +#define DECLARE_RWSEM(lockname) \
10081 +       struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
10082 +
10083 +extern void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
10084 +                                    struct lock_class_key *key);
10085 +
10086 +#define __rt_init_rwsem(sem, name, key)                        \
10087 +       do {                                            \
10088 +               rt_mutex_init(&(sem)->lock);            \
10089 +               __rt_rwsem_init((sem), (name), (key));\
10090 +       } while (0)
10091 +
10092 +#define __init_rwsem(sem, name, key) __rt_init_rwsem(sem, name, key)
10093 +
10094 +# define rt_init_rwsem(sem)                            \
10095 +do {                                                   \
10096 +       static struct lock_class_key __key;             \
10097 +                                                       \
10098 +       __rt_init_rwsem((sem), #sem, &__key);           \
10099 +} while (0)
10100 +
10101 +extern void rt_down_write(struct rw_semaphore *rwsem);
10102 +extern int  rt_down_write_killable(struct rw_semaphore *rwsem);
10103 +extern void rt_down_read_nested(struct rw_semaphore *rwsem, int subclass);
10104 +extern void rt_down_write_nested(struct rw_semaphore *rwsem, int subclass);
10105 +extern int  rt_down_write_killable_nested(struct rw_semaphore *rwsem,
10106 +                                         int subclass);
10107 +extern void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
10108 +                                     struct lockdep_map *nest);
10109 +extern void rt__down_read(struct rw_semaphore *rwsem);
10110 +extern void rt_down_read(struct rw_semaphore *rwsem);
10111 +extern int  rt_down_write_trylock(struct rw_semaphore *rwsem);
10112 +extern int  rt__down_read_trylock(struct rw_semaphore *rwsem);
10113 +extern int  rt_down_read_trylock(struct rw_semaphore *rwsem);
10114 +extern void __rt_up_read(struct rw_semaphore *rwsem);
10115 +extern void rt_up_read(struct rw_semaphore *rwsem);
10116 +extern void rt_up_write(struct rw_semaphore *rwsem);
10117 +extern void rt_downgrade_write(struct rw_semaphore *rwsem);
10118 +
10119 +#define init_rwsem(sem)                rt_init_rwsem(sem)
10120 +#define rwsem_is_locked(s)     rt_mutex_is_locked(&(s)->lock)
10121 +
10122 +static inline int rwsem_is_contended(struct rw_semaphore *sem)
10123 +{
10124 +       /* rt_mutex_has_waiters() */
10125 +       return !RB_EMPTY_ROOT(&sem->lock.waiters);
10126 +}
10127 +
10128 +static inline void __down_read(struct rw_semaphore *sem)
10129 +{
10130 +       rt__down_read(sem);
10131 +}
10132 +
10133 +static inline void down_read(struct rw_semaphore *sem)
10134 +{
10135 +       rt_down_read(sem);
10136 +}
10137 +
10138 +static inline int __down_read_trylock(struct rw_semaphore *sem)
10139 +{
10140 +       return rt__down_read_trylock(sem);
10141 +}
10142 +
10143 +static inline int down_read_trylock(struct rw_semaphore *sem)
10144 +{
10145 +       return rt_down_read_trylock(sem);
10146 +}
10147 +
10148 +static inline void down_write(struct rw_semaphore *sem)
10149 +{
10150 +       rt_down_write(sem);
10151 +}
10152 +
10153 +static inline int down_write_killable(struct rw_semaphore *sem)
10154 +{
10155 +       return rt_down_write_killable(sem);
10156 +}
10157 +
10158 +static inline int down_write_trylock(struct rw_semaphore *sem)
10159 +{
10160 +       return rt_down_write_trylock(sem);
10161 +}
10162 +
10163 +static inline void __up_read(struct rw_semaphore *sem)
10164 +{
10165 +       __rt_up_read(sem);
10166 +}
10167 +
10168 +static inline void up_read(struct rw_semaphore *sem)
10169 +{
10170 +       rt_up_read(sem);
10171 +}
10172 +
10173 +static inline void up_write(struct rw_semaphore *sem)
10174 +{
10175 +       rt_up_write(sem);
10176 +}
10177 +
10178 +static inline void downgrade_write(struct rw_semaphore *sem)
10179 +{
10180 +       rt_downgrade_write(sem);
10181 +}
10182 +
10183 +static inline void down_read_nested(struct rw_semaphore *sem, int subclass)
10184 +{
10185 +       return rt_down_read_nested(sem, subclass);
10186 +}
10187 +
10188 +static inline void down_write_nested(struct rw_semaphore *sem, int subclass)
10189 +{
10190 +       rt_down_write_nested(sem, subclass);
10191 +}
10192 +
10193 +static inline int down_write_killable_nested(struct rw_semaphore *sem,
10194 +                                            int subclass)
10195 +{
10196 +       return rt_down_write_killable_nested(sem, subclass);
10197 +}
10198 +
10199 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
10200 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
10201 +               struct rw_semaphore *nest_lock)
10202 +{
10203 +       rt_down_write_nested_lock(sem, &nest_lock->dep_map);
10204 +}
10205 +
10206 +#else
10207 +
10208 +static inline void down_write_nest_lock(struct rw_semaphore *sem,
10209 +               struct rw_semaphore *nest_lock)
10210 +{
10211 +       rt_down_write_nested_lock(sem, NULL);
10212 +}
10213 +#endif
10214 +#endif
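
As the header comment explains, the RT rw-semaphore is an rt_mutex plus a read_depth counter: only a single reader may hold it, and only that owner may read-acquire recursively, which keeps the lock fair and priority-inheritance friendly. The inline wrappers preserve the regular rwsem API, so callers are unchanged; a sketch (cfg_sem and cfg_value are illustrative):

static DECLARE_RWSEM(cfg_sem);
static int cfg_value;

static int cfg_get(void)
{
        int v;

        down_read(&cfg_sem);    /* rt_down_read(): single reader on RT */
        v = cfg_value;
        up_read(&cfg_sem);
        return v;
}

static int cfg_set(int v)
{
        if (down_write_killable(&cfg_sem))      /* wraps rt_down_write_killable() */
                return -EINTR;
        cfg_value = v;
        up_write(&cfg_sem);
        return 0;
}
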
10215 diff --git a/include/linux/sched.h b/include/linux/sched.h
10216 index 62c68e513e39..c873ce0183ab 100644
10217 --- a/include/linux/sched.h
10218 +++ b/include/linux/sched.h
10219 @@ -26,6 +26,7 @@ struct sched_param {
10220  #include <linux/nodemask.h>
10221  #include <linux/mm_types.h>
10222  #include <linux/preempt.h>
10223 +#include <asm/kmap_types.h>
10224  
10225  #include <asm/page.h>
10226  #include <asm/ptrace.h>
10227 @@ -243,10 +244,7 @@ extern char ___assert_task_state[1 - 2*!!(
10228                                  TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
10229                                  __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
10230  
10231 -#define task_is_traced(task)   ((task->state & __TASK_TRACED) != 0)
10232  #define task_is_stopped(task)  ((task->state & __TASK_STOPPED) != 0)
10233 -#define task_is_stopped_or_traced(task)        \
10234 -                       ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
10235  #define task_contributes_to_load(task) \
10236                                 ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
10237                                  (task->flags & PF_FROZEN) == 0 && \
10238 @@ -312,6 +310,11 @@ extern char ___assert_task_state[1 - 2*!!(
10239  
10240  #endif
10241  
10242 +#define __set_current_state_no_track(state_value)      \
10243 +       do { current->state = (state_value); } while (0)
10244 +#define set_current_state_no_track(state_value)                \
10245 +       set_mb(current->state, (state_value))
10246 +
10247  /* Task command name length */
10248  #define TASK_COMM_LEN 16
10249  
10250 @@ -1009,8 +1012,18 @@ struct wake_q_head {
10251         struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
10252  
10253  extern void wake_q_add(struct wake_q_head *head,
10254 -                      struct task_struct *task);
10255 -extern void wake_up_q(struct wake_q_head *head);
10256 +                             struct task_struct *task);
10257 +extern void __wake_up_q(struct wake_q_head *head, bool sleeper);
10258 +
10259 +static inline void wake_up_q(struct wake_q_head *head)
10260 +{
10261 +       __wake_up_q(head, false);
10262 +}
10263 +
10264 +static inline void wake_up_q_sleeper(struct wake_q_head *head)
10265 +{
10266 +       __wake_up_q(head, true);
10267 +}
10268  
10269  /*
10270   * sched-domains (multiprocessor balancing) declarations:
10271 @@ -1459,6 +1472,7 @@ struct tlbflush_unmap_batch {
10272  
10273  struct task_struct {
10274         volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
10275 +       volatile long saved_state;      /* saved state for "spinlock sleepers" */
10276         void *stack;
10277         atomic_t usage;
10278         unsigned int flags;     /* per process flags, defined below */
10279 @@ -1495,6 +1509,12 @@ struct task_struct {
10280  #endif
10281  
10282         unsigned int policy;
10283 +#ifdef CONFIG_PREEMPT_RT_FULL
10284 +       int migrate_disable;
10285 +# ifdef CONFIG_SCHED_DEBUG
10286 +       int migrate_disable_atomic;
10287 +# endif
10288 +#endif
10289         int nr_cpus_allowed;
10290         cpumask_t cpus_allowed;
10291  
10292 @@ -1629,6 +1649,9 @@ struct task_struct {
10293  
10294         struct task_cputime cputime_expires;
10295         struct list_head cpu_timers[3];
10296 +#ifdef CONFIG_PREEMPT_RT_BASE
10297 +       struct task_struct *posix_timer_list;
10298 +#endif
10299  
10300  /* process credentials */
10301         const struct cred __rcu *real_cred; /* objective and real subjective task
10302 @@ -1659,10 +1682,15 @@ struct task_struct {
10303  /* signal handlers */
10304         struct signal_struct *signal;
10305         struct sighand_struct *sighand;
10306 +       struct sigqueue *sigqueue_cache;
10307  
10308         sigset_t blocked, real_blocked;
10309         sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
10310         struct sigpending pending;
10311 +#ifdef CONFIG_PREEMPT_RT_FULL
10312 +       /* TODO: move me into ->restart_block ? */
10313 +       struct siginfo forced_info;
10314 +#endif
10315  
10316         unsigned long sas_ss_sp;
10317         size_t sas_ss_size;
10318 @@ -1891,6 +1919,12 @@ struct task_struct {
10319         /* bitmask and counter of trace recursion */
10320         unsigned long trace_recursion;
10321  #endif /* CONFIG_TRACING */
10322 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
10323 +       u64 preempt_timestamp_hist;
10324 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
10325 +       long timer_offset;
10326 +#endif
10327 +#endif
10328  #ifdef CONFIG_KCOV
10329         /* Coverage collection mode enabled for this task (0 if disabled). */
10330         enum kcov_mode kcov_mode;
10331 @@ -1916,9 +1950,23 @@ struct task_struct {
10332         unsigned int    sequential_io;
10333         unsigned int    sequential_io_avg;
10334  #endif
10335 +#ifdef CONFIG_PREEMPT_RT_BASE
10336 +       struct rcu_head put_rcu;
10337 +       int softirq_nestcnt;
10338 +       unsigned int softirqs_raised;
10339 +#endif
10340 +#ifdef CONFIG_PREEMPT_RT_FULL
10341 +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32
10342 +       int kmap_idx;
10343 +       pte_t kmap_pte[KM_TYPE_NR];
10344 +# endif
10345 +#endif
10346  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
10347         unsigned long   task_state_change;
10348  #endif
10349 +#ifdef CONFIG_PREEMPT_RT_FULL
10350 +       int xmit_recursion;
10351 +#endif
10352         int pagefault_disabled;
10353  #ifdef CONFIG_MMU
10354         struct task_struct *oom_reaper_list;
10355 @@ -1939,14 +1987,6 @@ extern int arch_task_struct_size __read_mostly;
10356  # define arch_task_struct_size (sizeof(struct task_struct))
10357  #endif
10358  
10359 -/* Future-safe accessor for struct task_struct's cpus_allowed. */
10360 -#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
10361 -
10362 -static inline int tsk_nr_cpus_allowed(struct task_struct *p)
10363 -{
10364 -       return p->nr_cpus_allowed;
10365 -}
10366 -
10367  #define TNF_MIGRATED   0x01
10368  #define TNF_NO_GROUP   0x02
10369  #define TNF_SHARED     0x04
10370 @@ -2162,6 +2202,15 @@ extern struct pid *cad_pid;
10371  extern void free_task(struct task_struct *tsk);
10372  #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
10373  
10374 +#ifdef CONFIG_PREEMPT_RT_BASE
10375 +extern void __put_task_struct_cb(struct rcu_head *rhp);
10376 +
10377 +static inline void put_task_struct(struct task_struct *t)
10378 +{
10379 +       if (atomic_dec_and_test(&t->usage))
10380 +               call_rcu(&t->put_rcu, __put_task_struct_cb);
10381 +}
10382 +#else
10383  extern void __put_task_struct(struct task_struct *t);
10384  
10385  static inline void put_task_struct(struct task_struct *t)
10386 @@ -2169,6 +2218,7 @@ static inline void put_task_struct(struct task_struct *t)
10387         if (atomic_dec_and_test(&t->usage))
10388                 __put_task_struct(t);
10389  }
10390 +#endif
10391  
10392  struct task_struct *task_rcu_dereference(struct task_struct **ptask);
10393  struct task_struct *try_get_task_struct(struct task_struct **ptask);
10394 @@ -2210,6 +2260,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
10395  /*
10396   * Per process flags
10397   */
10398 +#define PF_IN_SOFTIRQ  0x00000001      /* Task is serving softirq */
10399  #define PF_EXITING     0x00000004      /* getting shut down */
10400  #define PF_EXITPIDONE  0x00000008      /* pi exit done on shut down */
10401  #define PF_VCPU                0x00000010      /* I'm a virtual CPU */
10402 @@ -2378,6 +2429,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
10403  
10404  extern int set_cpus_allowed_ptr(struct task_struct *p,
10405                                 const struct cpumask *new_mask);
10406 +int migrate_me(void);
10407 +void tell_sched_cpu_down_begin(int cpu);
10408 +void tell_sched_cpu_down_done(int cpu);
10409 +
10410  #else
10411  static inline void do_set_cpus_allowed(struct task_struct *p,
10412                                       const struct cpumask *new_mask)
10413 @@ -2390,6 +2445,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
10414                 return -EINVAL;
10415         return 0;
10416  }
10417 +static inline int migrate_me(void) { return 0; }
10418 +static inline void tell_sched_cpu_down_begin(int cpu) { }
10419 +static inline void tell_sched_cpu_down_done(int cpu) { }
10420  #endif
10421  
10422  #ifdef CONFIG_NO_HZ_COMMON
10423 @@ -2624,6 +2682,7 @@ extern void xtime_update(unsigned long ticks);
10424  
10425  extern int wake_up_state(struct task_struct *tsk, unsigned int state);
10426  extern int wake_up_process(struct task_struct *tsk);
10427 +extern int wake_up_lock_sleeper(struct task_struct * tsk);
10428  extern void wake_up_new_task(struct task_struct *tsk);
10429  #ifdef CONFIG_SMP
10430   extern void kick_process(struct task_struct *tsk);
10431 @@ -2832,6 +2891,17 @@ static inline void mmdrop(struct mm_struct *mm)
10432                 __mmdrop(mm);
10433  }
10434  
10435 +#ifdef CONFIG_PREEMPT_RT_BASE
10436 +extern void __mmdrop_delayed(struct rcu_head *rhp);
10437 +static inline void mmdrop_delayed(struct mm_struct *mm)
10438 +{
10439 +       if (atomic_dec_and_test(&mm->mm_count))
10440 +               call_rcu(&mm->delayed_drop, __mmdrop_delayed);
10441 +}
10442 +#else
10443 +# define mmdrop_delayed(mm)    mmdrop(mm)
10444 +#endif
10445 +
10446  static inline bool mmget_not_zero(struct mm_struct *mm)
10447  {
10448         return atomic_inc_not_zero(&mm->mm_users);
10449 @@ -3168,6 +3238,43 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
10450         return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
10451  }
10452  
10453 +#ifdef CONFIG_PREEMPT_LAZY
10454 +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk)
10455 +{
10456 +       set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
10457 +}
10458 +
10459 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk)
10460 +{
10461 +       clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY);
10462 +}
10463 +
10464 +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk)
10465 +{
10466 +       return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY));
10467 +}
10468 +
10469 +static inline int need_resched_lazy(void)
10470 +{
10471 +       return test_thread_flag(TIF_NEED_RESCHED_LAZY);
10472 +}
10473 +
10474 +static inline int need_resched_now(void)
10475 +{
10476 +       return test_thread_flag(TIF_NEED_RESCHED);
10477 +}
10478 +
10479 +#else
10480 +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { }
10481 +static inline int need_resched_lazy(void) { return 0; }
10482 +
10483 +static inline int need_resched_now(void)
10484 +{
10485 +       return test_thread_flag(TIF_NEED_RESCHED);
10486 +}
10487 +
10488 +#endif
10489 +
10490  static inline int restart_syscall(void)
10491  {
10492         set_tsk_thread_flag(current, TIF_SIGPENDING);
10493 @@ -3199,6 +3306,51 @@ static inline int signal_pending_state(long state, struct task_struct *p)
10494         return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
10495  }
10496  
10497 +static inline bool __task_is_stopped_or_traced(struct task_struct *task)
10498 +{
10499 +       if (task->state & (__TASK_STOPPED | __TASK_TRACED))
10500 +               return true;
10501 +#ifdef CONFIG_PREEMPT_RT_FULL
10502 +       if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED))
10503 +               return true;
10504 +#endif
10505 +       return false;
10506 +}
10507 +
10508 +static inline bool task_is_stopped_or_traced(struct task_struct *task)
10509 +{
10510 +       bool traced_stopped;
10511 +
10512 +#ifdef CONFIG_PREEMPT_RT_FULL
10513 +       unsigned long flags;
10514 +
10515 +       raw_spin_lock_irqsave(&task->pi_lock, flags);
10516 +       traced_stopped = __task_is_stopped_or_traced(task);
10517 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
10518 +#else
10519 +       traced_stopped = __task_is_stopped_or_traced(task);
10520 +#endif
10521 +       return traced_stopped;
10522 +}
10523 +
10524 +static inline bool task_is_traced(struct task_struct *task)
10525 +{
10526 +       bool traced = false;
10527 +
10528 +       if (task->state & __TASK_TRACED)
10529 +               return true;
10530 +#ifdef CONFIG_PREEMPT_RT_FULL
10531 +       /* in case the task is sleeping on tasklist_lock */
10532 +       raw_spin_lock_irq(&task->pi_lock);
10533 +       if (task->state & __TASK_TRACED)
10534 +               traced = true;
10535 +       else if (task->saved_state & __TASK_TRACED)
10536 +               traced = true;
10537 +       raw_spin_unlock_irq(&task->pi_lock);
10538 +#endif
10539 +       return traced;
10540 +}
10541 +
10542  /*
10543   * cond_resched() and cond_resched_lock(): latency reduction via
10544   * explicit rescheduling in places that are safe. The return
10545 @@ -3220,12 +3372,16 @@ extern int __cond_resched_lock(spinlock_t *lock);
10546         __cond_resched_lock(lock);                              \
10547  })
10548  
10549 +#ifndef CONFIG_PREEMPT_RT_FULL
10550  extern int __cond_resched_softirq(void);
10551  
10552  #define cond_resched_softirq() ({                                      \
10553         ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET);     \
10554         __cond_resched_softirq();                                       \
10555  })
10556 +#else
10557 +# define cond_resched_softirq()                cond_resched()
10558 +#endif
10559  
10560  static inline void cond_resched_rcu(void)
10561  {
10562 @@ -3387,6 +3543,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
10563  
10564  #endif /* CONFIG_SMP */
10565  
10566 +static inline int __migrate_disabled(struct task_struct *p)
10567 +{
10568 +#ifdef CONFIG_PREEMPT_RT_FULL
10569 +       return p->migrate_disable;
10570 +#else
10571 +       return 0;
10572 +#endif
10573 +}
10574 +
10575 +/* Future-safe accessor for struct task_struct's cpus_allowed. */
10576 +static inline const struct cpumask *tsk_cpus_allowed(struct task_struct *p)
10577 +{
10578 +       if (__migrate_disabled(p))
10579 +               return cpumask_of(task_cpu(p));
10580 +
10581 +       return &p->cpus_allowed;
10582 +}
10583 +
10584 +static inline int tsk_nr_cpus_allowed(struct task_struct *p)
10585 +{
10586 +       if (__migrate_disabled(p))
10587 +               return 1;
10588 +       return p->nr_cpus_allowed;
10589 +}
10590 +
10591  extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
10592  extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
10593  
10594 diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
10595 index ead97654c4e9..3d7223ffdd3b 100644
10596 --- a/include/linux/seqlock.h
10597 +++ b/include/linux/seqlock.h
10598 @@ -220,20 +220,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
10599         return __read_seqcount_retry(s, start);
10600  }
10601  
10602 -
10603 -
10604 -static inline void raw_write_seqcount_begin(seqcount_t *s)
10605 +static inline void __raw_write_seqcount_begin(seqcount_t *s)
10606  {
10607         s->sequence++;
10608         smp_wmb();
10609  }
10610  
10611 -static inline void raw_write_seqcount_end(seqcount_t *s)
10612 +static inline void raw_write_seqcount_begin(seqcount_t *s)
10613 +{
10614 +       preempt_disable_rt();
10615 +       __raw_write_seqcount_begin(s);
10616 +}
10617 +
10618 +static inline void __raw_write_seqcount_end(seqcount_t *s)
10619  {
10620         smp_wmb();
10621         s->sequence++;
10622  }
10623  
10624 +static inline void raw_write_seqcount_end(seqcount_t *s)
10625 +{
10626 +       __raw_write_seqcount_end(s);
10627 +       preempt_enable_rt();
10628 +}
10629 +
10630  /**
10631   * raw_write_seqcount_barrier - do a seq write barrier
10632   * @s: pointer to seqcount_t
10633 @@ -428,10 +438,32 @@ typedef struct {
10634  /*
10635   * Read side functions for starting and finalizing a read side section.
10636   */
10637 +#ifndef CONFIG_PREEMPT_RT_FULL
10638  static inline unsigned read_seqbegin(const seqlock_t *sl)
10639  {
10640         return read_seqcount_begin(&sl->seqcount);
10641  }
10642 +#else
10643 +/*
10644 + * Starvation safe read side for RT
10645 + */
10646 +static inline unsigned read_seqbegin(seqlock_t *sl)
10647 +{
10648 +       unsigned ret;
10649 +
10650 +repeat:
10651 +       ret = ACCESS_ONCE(sl->seqcount.sequence);
10652 +       if (unlikely(ret & 1)) {
10653 +                * Take the lock and let the writer proceed (i.e. possibly
10654 +                * boost it), otherwise we could loop here forever.
10655 +                * boost it), otherwise we could loop here forever.
10656 +                */
10657 +               spin_unlock_wait(&sl->lock);
10658 +               goto repeat;
10659 +       }
10660 +       return ret;
10661 +}
10662 +#endif
10663  
10664  static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
10665  {
10666 @@ -446,36 +478,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
10667  static inline void write_seqlock(seqlock_t *sl)
10668  {
10669         spin_lock(&sl->lock);
10670 -       write_seqcount_begin(&sl->seqcount);
10671 +       __raw_write_seqcount_begin(&sl->seqcount);
10672 +}
10673 +
10674 +static inline int try_write_seqlock(seqlock_t *sl)
10675 +{
10676 +       if (spin_trylock(&sl->lock)) {
10677 +               __raw_write_seqcount_begin(&sl->seqcount);
10678 +               return 1;
10679 +       }
10680 +       return 0;
10681  }
10682  
10683  static inline void write_sequnlock(seqlock_t *sl)
10684  {
10685 -       write_seqcount_end(&sl->seqcount);
10686 +       __raw_write_seqcount_end(&sl->seqcount);
10687         spin_unlock(&sl->lock);
10688  }
10689  
10690  static inline void write_seqlock_bh(seqlock_t *sl)
10691  {
10692         spin_lock_bh(&sl->lock);
10693 -       write_seqcount_begin(&sl->seqcount);
10694 +       __raw_write_seqcount_begin(&sl->seqcount);
10695  }
10696  
10697  static inline void write_sequnlock_bh(seqlock_t *sl)
10698  {
10699 -       write_seqcount_end(&sl->seqcount);
10700 +       __raw_write_seqcount_end(&sl->seqcount);
10701         spin_unlock_bh(&sl->lock);
10702  }
10703  
10704  static inline void write_seqlock_irq(seqlock_t *sl)
10705  {
10706         spin_lock_irq(&sl->lock);
10707 -       write_seqcount_begin(&sl->seqcount);
10708 +       __raw_write_seqcount_begin(&sl->seqcount);
10709  }
10710  
10711  static inline void write_sequnlock_irq(seqlock_t *sl)
10712  {
10713 -       write_seqcount_end(&sl->seqcount);
10714 +       __raw_write_seqcount_end(&sl->seqcount);
10715         spin_unlock_irq(&sl->lock);
10716  }
10717  
10718 @@ -484,7 +525,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
10719         unsigned long flags;
10720  
10721         spin_lock_irqsave(&sl->lock, flags);
10722 -       write_seqcount_begin(&sl->seqcount);
10723 +       __raw_write_seqcount_begin(&sl->seqcount);
10724         return flags;
10725  }
10726  
10727 @@ -494,7 +535,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
10728  static inline void
10729  write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
10730  {
10731 -       write_seqcount_end(&sl->seqcount);
10732 +       __raw_write_seqcount_end(&sl->seqcount);
10733         spin_unlock_irqrestore(&sl->lock, flags);
10734  }
10735  
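
The seqcount write side must not be preempted while the sequence is odd, so raw_write_seqcount_begin()/end() now bracket the raw operations with preempt_disable_rt()/preempt_enable_rt(), while the seqlock_t writers (which already hold sl->lock, a sleeping lock on RT) switch to the new __raw variants. The RT read_seqbegin() waits on spin_unlock_wait() instead of spinning on the sequence, so a preempted writer can be boosted rather than starved against. The usual reader/writer pattern stays the same; sketch (state_lock and the two fields are illustrative):

static DEFINE_SEQLOCK(state_lock);
static u64 state_a, state_b;

static void state_update(u64 a, u64 b)
{
        write_seqlock(&state_lock);     /* takes sl->lock, then __raw_write_seqcount_begin() */
        state_a = a;
        state_b = b;
        write_sequnlock(&state_lock);
}

static void state_read(u64 *a, u64 *b)
{
        unsigned int seq;

        do {
                seq = read_seqbegin(&state_lock);       /* RT variant waits out an active writer */
                *a = state_a;
                *b = state_b;
        } while (read_seqretry(&state_lock, seq));
}
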
10736 diff --git a/include/linux/signal.h b/include/linux/signal.h
10737 index b63f63eaa39c..295540fdfc72 100644
10738 --- a/include/linux/signal.h
10739 +++ b/include/linux/signal.h
10740 @@ -233,6 +233,7 @@ static inline void init_sigpending(struct sigpending *sig)
10741  }
10742  
10743  extern void flush_sigqueue(struct sigpending *queue);
10744 +extern void flush_task_sigqueue(struct task_struct *tsk);
10745  
10746  /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
10747  static inline int valid_signal(unsigned long sig)
10748 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
10749 index 0f665cb26b50..59c38d1635c8 100644
10750 --- a/include/linux/skbuff.h
10751 +++ b/include/linux/skbuff.h
10752 @@ -284,6 +284,7 @@ struct sk_buff_head {
10753  
10754         __u32           qlen;
10755         spinlock_t      lock;
10756 +       raw_spinlock_t  raw_lock;
10757  };
10758  
10759  struct sk_buff;
10760 @@ -1565,6 +1566,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list)
10761         __skb_queue_head_init(list);
10762  }
10763  
10764 +static inline void skb_queue_head_init_raw(struct sk_buff_head *list)
10765 +{
10766 +       raw_spin_lock_init(&list->raw_lock);
10767 +       __skb_queue_head_init(list);
10768 +}
10769 +
10770  static inline void skb_queue_head_init_class(struct sk_buff_head *list,
10771                 struct lock_class_key *class)
10772  {
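
sk_buff_head grows a raw_spinlock_t alongside the regular (on RT: sleeping) spinlock, plus skb_queue_head_init_raw(), for queues that must remain usable from hard-atomic context on RT. Such users take raw_lock themselves and operate on the list with the lockless __skb_* helpers; a sketch of that pattern with illustrative helpers (queue setup not shown):

static void enqueue_raw(struct sk_buff_head *list, struct sk_buff *skb)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&list->raw_lock, flags);
        __skb_queue_tail(list, skb);    /* lockless helper; we hold raw_lock */
        raw_spin_unlock_irqrestore(&list->raw_lock, flags);
}

static struct sk_buff *dequeue_raw(struct sk_buff_head *list)
{
        struct sk_buff *skb;
        unsigned long flags;

        raw_spin_lock_irqsave(&list->raw_lock, flags);
        skb = __skb_dequeue(list);
        raw_spin_unlock_irqrestore(&list->raw_lock, flags);
        return skb;
}
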
10773 diff --git a/include/linux/smp.h b/include/linux/smp.h
10774 index eccae4690f41..64ec52d951c3 100644
10775 --- a/include/linux/smp.h
10776 +++ b/include/linux/smp.h
10777 @@ -185,6 +185,9 @@ static inline void smp_init(void) { }
10778  #define get_cpu()              ({ preempt_disable(); smp_processor_id(); })
10779  #define put_cpu()              preempt_enable()
10780  
10781 +#define get_cpu_light()                ({ migrate_disable(); smp_processor_id(); })
10782 +#define put_cpu_light()                migrate_enable()
10783 +
10784  /*
10785   * Callback to arch code if there's nosmp or maxcpus=0 on the
10786   * boot command line:
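
get_cpu_light()/put_cpu_light() pin the task to its CPU with migrate_disable() instead of disabling preemption, so the section may take sleeping locks on RT while still being guaranteed to stay on one CPU. The usual conversion pairs it with a per-CPU (sleeping) lock rather than relying on preemption being off; a sketch with illustrative names (per-CPU lock initialization omitted):

struct pcpu_bucket {
        spinlock_t lock;        /* sleeping lock on RT */
        int count;
};
static DEFINE_PER_CPU(struct pcpu_bucket, buckets);

static void bump_bucket(void)
{
        struct pcpu_bucket *b;

        b = &per_cpu(buckets, get_cpu_light()); /* migrate_disable() + smp_processor_id() */
        spin_lock(&b->lock);                    /* may sleep; migration stays disabled */
        b->count++;
        spin_unlock(&b->lock);
        put_cpu_light();                        /* migrate_enable() */
}
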
10787 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
10788 index 47dd0cebd204..02928fa5499d 100644
10789 --- a/include/linux/spinlock.h
10790 +++ b/include/linux/spinlock.h
10791 @@ -271,7 +271,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
10792  #define raw_spin_can_lock(lock)        (!raw_spin_is_locked(lock))
10793  
10794  /* Include rwlock functions */
10795 -#include <linux/rwlock.h>
10796 +#ifdef CONFIG_PREEMPT_RT_FULL
10797 +# include <linux/rwlock_rt.h>
10798 +#else
10799 +# include <linux/rwlock.h>
10800 +#endif
10801  
10802  /*
10803   * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
10804 @@ -282,6 +286,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
10805  # include <linux/spinlock_api_up.h>
10806  #endif
10807  
10808 +#ifdef CONFIG_PREEMPT_RT_FULL
10809 +# include <linux/spinlock_rt.h>
10810 +#else /* PREEMPT_RT_FULL */
10811 +
10812  /*
10813   * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
10814   */
10815 @@ -347,6 +355,12 @@ static __always_inline void spin_unlock(spinlock_t *lock)
10816         raw_spin_unlock(&lock->rlock);
10817  }
10818  
10819 +static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
10820 +{
10821 +       raw_spin_unlock(&lock->rlock);
10822 +       return 0;
10823 +}
10824 +
10825  static __always_inline void spin_unlock_bh(spinlock_t *lock)
10826  {
10827         raw_spin_unlock_bh(&lock->rlock);
10828 @@ -416,4 +430,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
10829  #define atomic_dec_and_lock(atomic, lock) \
10830                 __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))
10831  
10832 +#endif /* !PREEMPT_RT_FULL */
10833 +
10834  #endif /* __LINUX_SPINLOCK_H */
10835 diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
10836 index 5344268e6e62..043263f30e81 100644
10837 --- a/include/linux/spinlock_api_smp.h
10838 +++ b/include/linux/spinlock_api_smp.h
10839 @@ -189,6 +189,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock)
10840         return 0;
10841  }
10842  
10843 -#include <linux/rwlock_api_smp.h>
10844 +#ifndef CONFIG_PREEMPT_RT_FULL
10845 +# include <linux/rwlock_api_smp.h>
10846 +#endif
10847  
10848  #endif /* __LINUX_SPINLOCK_API_SMP_H */
10849 diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
10850 new file mode 100644
10851 index 000000000000..7eb87584e843
10852 --- /dev/null
10853 +++ b/include/linux/spinlock_rt.h
10854 @@ -0,0 +1,165 @@
10855 +#ifndef __LINUX_SPINLOCK_RT_H
10856 +#define __LINUX_SPINLOCK_RT_H
10857 +
10858 +#ifndef __LINUX_SPINLOCK_H
10859 +#error Do not include directly. Use spinlock.h
10860 +#endif
10861 +
10862 +#include <linux/bug.h>
10863 +
10864 +extern void
10865 +__rt_spin_lock_init(spinlock_t *lock, char *name, struct lock_class_key *key);
10866 +
10867 +#define spin_lock_init(slock)                          \
10868 +do {                                                   \
10869 +       static struct lock_class_key __key;             \
10870 +                                                       \
10871 +       rt_mutex_init(&(slock)->lock);                  \
10872 +       __rt_spin_lock_init(slock, #slock, &__key);     \
10873 +} while (0)
10874 +
10875 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
10876 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
10877 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
10878 +
10879 +extern void __lockfunc rt_spin_lock(spinlock_t *lock);
10880 +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
10881 +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
10882 +extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
10883 +extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
10884 +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
10885 +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
10886 +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
10887 +extern int __lockfunc rt_spin_trylock(spinlock_t *lock);
10888 +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
10889 +
10890 +/*
10891 + * lockdep-less calls, for derived types like rwlock:
10892 + * (for trylock they can use rt_mutex_trylock() directly.)
10893 + */
10894 +extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
10895 +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
10896 +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
10897 +extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
10898 +
10899 +#define spin_lock(lock)                        rt_spin_lock(lock)
10900 +
10901 +#define spin_lock_bh(lock)                     \
10902 +       do {                                    \
10903 +               local_bh_disable();             \
10904 +               rt_spin_lock(lock);             \
10905 +       } while (0)
10906 +
10907 +#define spin_lock_irq(lock)            spin_lock(lock)
10908 +
10909 +#define spin_do_trylock(lock)          __cond_lock(lock, rt_spin_trylock(lock))
10910 +
10911 +#define spin_trylock(lock)                     \
10912 +({                                             \
10913 +       int __locked;                           \
10914 +       __locked = spin_do_trylock(lock);       \
10915 +       __locked;                               \
10916 +})
10917 +
10918 +#ifdef CONFIG_LOCKDEP
10919 +# define spin_lock_nested(lock, subclass)              \
10920 +       do {                                            \
10921 +               rt_spin_lock_nested(lock, subclass);    \
10922 +       } while (0)
10923 +
10924 +#define spin_lock_bh_nested(lock, subclass)            \
10925 +       do {                                            \
10926 +               local_bh_disable();                     \
10927 +               rt_spin_lock_nested(lock, subclass);    \
10928 +       } while (0)
10929 +
10930 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
10931 +       do {                                             \
10932 +               typecheck(unsigned long, flags);         \
10933 +               flags = 0;                               \
10934 +               rt_spin_lock_nested(lock, subclass);     \
10935 +       } while (0)
10936 +#else
10937 +# define spin_lock_nested(lock, subclass)      spin_lock(lock)
10938 +# define spin_lock_bh_nested(lock, subclass)   spin_lock_bh(lock)
10939 +
10940 +# define spin_lock_irqsave_nested(lock, flags, subclass) \
10941 +       do {                                             \
10942 +               typecheck(unsigned long, flags);         \
10943 +               flags = 0;                               \
10944 +               spin_lock(lock);                         \
10945 +       } while (0)
10946 +#endif
10947 +
10948 +#define spin_lock_irqsave(lock, flags)                  \
10949 +       do {                                             \
10950 +               typecheck(unsigned long, flags);         \
10951 +               flags = 0;                               \
10952 +               spin_lock(lock);                         \
10953 +       } while (0)
10954 +
10955 +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
10956 +{
10957 +       unsigned long flags = 0;
10958 +#ifdef CONFIG_TRACE_IRQFLAGS
10959 +       flags = rt_spin_lock_trace_flags(lock);
10960 +#else
10961 +       spin_lock(lock); /* lock_local */
10962 +#endif
10963 +       return flags;
10964 +}
10965 +
10966 +/* FIXME: we need rt_spin_lock_nest_lock */
10967 +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
10968 +
10969 +#define spin_unlock(lock)                      rt_spin_unlock(lock)
10970 +#define spin_unlock_no_deboost(lock)           rt_spin_unlock_no_deboost(lock)
10971 +
10972 +#define spin_unlock_bh(lock)                           \
10973 +       do {                                            \
10974 +               rt_spin_unlock(lock);                   \
10975 +               local_bh_enable();                      \
10976 +       } while (0)
10977 +
10978 +#define spin_unlock_irq(lock)          spin_unlock(lock)
10979 +
10980 +#define spin_unlock_irqrestore(lock, flags)            \
10981 +       do {                                            \
10982 +               typecheck(unsigned long, flags);        \
10983 +               (void) flags;                           \
10984 +               spin_unlock(lock);                      \
10985 +       } while (0)
10986 +
10987 +#define spin_trylock_bh(lock)  __cond_lock(lock, rt_spin_trylock_bh(lock))
10988 +#define spin_trylock_irq(lock) spin_trylock(lock)
10989 +
10990 +#define spin_trylock_irqsave(lock, flags)      \
10991 +       rt_spin_trylock_irqsave(lock, &(flags))
10992 +
10993 +#define spin_unlock_wait(lock)         rt_spin_unlock_wait(lock)
10994 +
10995 +#ifdef CONFIG_GENERIC_LOCKBREAK
10996 +# define spin_is_contended(lock)       ((lock)->break_lock)
10997 +#else
10998 +# define spin_is_contended(lock)       (((void)(lock), 0))
10999 +#endif
11000 +
11001 +static inline int spin_can_lock(spinlock_t *lock)
11002 +{
11003 +       return !rt_mutex_is_locked(&lock->lock);
11004 +}
11005 +
11006 +static inline int spin_is_locked(spinlock_t *lock)
11007 +{
11008 +       return rt_mutex_is_locked(&lock->lock);
11009 +}
11010 +
11011 +static inline void assert_spin_locked(spinlock_t *lock)
11012 +{
11013 +       BUG_ON(!spin_is_locked(lock));
11014 +}
11015 +
11016 +#define atomic_dec_and_lock(atomic, lock) \
11017 +       atomic_dec_and_spin_lock(atomic, lock)
11018 +
11019 +#endif
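
With PREEMPT_RT_FULL, spinlock_t itself becomes a sleeping lock built on rt_mutex: spin_lock() maps to rt_spin_lock(), the _bh variants still disable bottom halves around it, and the _irq/_irqsave variants neither disable interrupts nor carry state in flags (it is typechecked and set to 0). The calling convention is unchanged; what changes is that the lock may sleep and therefore must not be taken from truly atomic context (raw_spinlock_t remains available for that). Sketch (dev_lock/dev_state are illustrative):

static DEFINE_SPINLOCK(dev_lock);
static int dev_state;

static void dev_set_state(int v)
{
        unsigned long flags;

        /* On RT: rt_spin_lock(); interrupts stay enabled, flags ends up 0. */
        spin_lock_irqsave(&dev_lock, flags);
        dev_state = v;
        spin_unlock_irqrestore(&dev_lock, flags);
}

static bool dev_try_set_state(int v)
{
        if (!spin_trylock(&dev_lock))   /* rt_spin_trylock() */
                return false;
        dev_state = v;
        spin_unlock(&dev_lock);
        return true;
}
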
11020 diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
11021 index 73548eb13a5d..10bac715ea96 100644
11022 --- a/include/linux/spinlock_types.h
11023 +++ b/include/linux/spinlock_types.h
11024 @@ -9,80 +9,15 @@
11025   * Released under the General Public License (GPL).
11026   */
11027  
11028 -#if defined(CONFIG_SMP)
11029 -# include <asm/spinlock_types.h>
11030 +#include <linux/spinlock_types_raw.h>
11031 +
11032 +#ifndef CONFIG_PREEMPT_RT_FULL
11033 +# include <linux/spinlock_types_nort.h>
11034 +# include <linux/rwlock_types.h>
11035  #else
11036 -# include <linux/spinlock_types_up.h>
11037 +# include <linux/rtmutex.h>
11038 +# include <linux/spinlock_types_rt.h>
11039 +# include <linux/rwlock_types_rt.h>
11040  #endif
11041  
11042 -#include <linux/lockdep.h>
11043 -
11044 -typedef struct raw_spinlock {
11045 -       arch_spinlock_t raw_lock;
11046 -#ifdef CONFIG_GENERIC_LOCKBREAK
11047 -       unsigned int break_lock;
11048 -#endif
11049 -#ifdef CONFIG_DEBUG_SPINLOCK
11050 -       unsigned int magic, owner_cpu;
11051 -       void *owner;
11052 -#endif
11053 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
11054 -       struct lockdep_map dep_map;
11055 -#endif
11056 -} raw_spinlock_t;
11057 -
11058 -#define SPINLOCK_MAGIC         0xdead4ead
11059 -
11060 -#define SPINLOCK_OWNER_INIT    ((void *)-1L)
11061 -
11062 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
11063 -# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
11064 -#else
11065 -# define SPIN_DEP_MAP_INIT(lockname)
11066 -#endif
11067 -
11068 -#ifdef CONFIG_DEBUG_SPINLOCK
11069 -# define SPIN_DEBUG_INIT(lockname)             \
11070 -       .magic = SPINLOCK_MAGIC,                \
11071 -       .owner_cpu = -1,                        \
11072 -       .owner = SPINLOCK_OWNER_INIT,
11073 -#else
11074 -# define SPIN_DEBUG_INIT(lockname)
11075 -#endif
11076 -
11077 -#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
11078 -       {                                       \
11079 -       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
11080 -       SPIN_DEBUG_INIT(lockname)               \
11081 -       SPIN_DEP_MAP_INIT(lockname) }
11082 -
11083 -#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
11084 -       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
11085 -
11086 -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
11087 -
11088 -typedef struct spinlock {
11089 -       union {
11090 -               struct raw_spinlock rlock;
11091 -
11092 -#ifdef CONFIG_DEBUG_LOCK_ALLOC
11093 -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
11094 -               struct {
11095 -                       u8 __padding[LOCK_PADSIZE];
11096 -                       struct lockdep_map dep_map;
11097 -               };
11098 -#endif
11099 -       };
11100 -} spinlock_t;
11101 -
11102 -#define __SPIN_LOCK_INITIALIZER(lockname) \
11103 -       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
11104 -
11105 -#define __SPIN_LOCK_UNLOCKED(lockname) \
11106 -       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
11107 -
11108 -#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
11109 -
11110 -#include <linux/rwlock_types.h>
11111 -
11112  #endif /* __LINUX_SPINLOCK_TYPES_H */
11113 diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h
11114 new file mode 100644
11115 index 000000000000..f1dac1fb1d6a
11116 --- /dev/null
11117 +++ b/include/linux/spinlock_types_nort.h
11118 @@ -0,0 +1,33 @@
11119 +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H
11120 +#define __LINUX_SPINLOCK_TYPES_NORT_H
11121 +
11122 +#ifndef __LINUX_SPINLOCK_TYPES_H
11123 +#error "Do not include directly. Include spinlock_types.h instead"
11124 +#endif
11125 +
11126 +/*
11127 + * The non RT version maps spinlocks to raw_spinlocks
11128 + */
11129 +typedef struct spinlock {
11130 +       union {
11131 +               struct raw_spinlock rlock;
11132 +
11133 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11134 +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
11135 +               struct {
11136 +                       u8 __padding[LOCK_PADSIZE];
11137 +                       struct lockdep_map dep_map;
11138 +               };
11139 +#endif
11140 +       };
11141 +} spinlock_t;
11142 +
11143 +#define __SPIN_LOCK_INITIALIZER(lockname) \
11144 +       { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } }
11145 +
11146 +#define __SPIN_LOCK_UNLOCKED(lockname) \
11147 +       (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname)
11148 +
11149 +#define DEFINE_SPINLOCK(x)     spinlock_t x = __SPIN_LOCK_UNLOCKED(x)
11150 +
11151 +#endif
11152 diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h
11153 new file mode 100644
11154 index 000000000000..edffc4d53fc9
11155 --- /dev/null
11156 +++ b/include/linux/spinlock_types_raw.h
11157 @@ -0,0 +1,56 @@
11158 +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H
11159 +#define __LINUX_SPINLOCK_TYPES_RAW_H
11160 +
11161 +#if defined(CONFIG_SMP)
11162 +# include <asm/spinlock_types.h>
11163 +#else
11164 +# include <linux/spinlock_types_up.h>
11165 +#endif
11166 +
11167 +#include <linux/lockdep.h>
11168 +
11169 +typedef struct raw_spinlock {
11170 +       arch_spinlock_t raw_lock;
11171 +#ifdef CONFIG_GENERIC_LOCKBREAK
11172 +       unsigned int break_lock;
11173 +#endif
11174 +#ifdef CONFIG_DEBUG_SPINLOCK
11175 +       unsigned int magic, owner_cpu;
11176 +       void *owner;
11177 +#endif
11178 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11179 +       struct lockdep_map dep_map;
11180 +#endif
11181 +} raw_spinlock_t;
11182 +
11183 +#define SPINLOCK_MAGIC         0xdead4ead
11184 +
11185 +#define SPINLOCK_OWNER_INIT    ((void *)-1L)
11186 +
11187 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11188 +# define SPIN_DEP_MAP_INIT(lockname)   .dep_map = { .name = #lockname }
11189 +#else
11190 +# define SPIN_DEP_MAP_INIT(lockname)
11191 +#endif
11192 +
11193 +#ifdef CONFIG_DEBUG_SPINLOCK
11194 +# define SPIN_DEBUG_INIT(lockname)             \
11195 +       .magic = SPINLOCK_MAGIC,                \
11196 +       .owner_cpu = -1,                        \
11197 +       .owner = SPINLOCK_OWNER_INIT,
11198 +#else
11199 +# define SPIN_DEBUG_INIT(lockname)
11200 +#endif
11201 +
11202 +#define __RAW_SPIN_LOCK_INITIALIZER(lockname)  \
11203 +       {                                       \
11204 +       .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,  \
11205 +       SPIN_DEBUG_INIT(lockname)               \
11206 +       SPIN_DEP_MAP_INIT(lockname) }
11207 +
11208 +#define __RAW_SPIN_LOCK_UNLOCKED(lockname)     \
11209 +       (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
11210 +
11211 +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x)
11212 +
11213 +#endif
11214 diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h
11215 new file mode 100644
11216 index 000000000000..3e3d8c5f7a9a
11217 --- /dev/null
11218 +++ b/include/linux/spinlock_types_rt.h
11219 @@ -0,0 +1,48 @@
11220 +#ifndef __LINUX_SPINLOCK_TYPES_RT_H
11221 +#define __LINUX_SPINLOCK_TYPES_RT_H
11222 +
11223 +#ifndef __LINUX_SPINLOCK_TYPES_H
11224 +#error "Do not include directly. Include spinlock_types.h instead"
11225 +#endif
11226 +
11227 +#include <linux/cache.h>
11228 +
11229 +/*
11230 + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field:
11231 + */
11232 +typedef struct spinlock {
11233 +       struct rt_mutex         lock;
11234 +       unsigned int            break_lock;
11235 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
11236 +       struct lockdep_map      dep_map;
11237 +#endif
11238 +} spinlock_t;
11239 +
11240 +#ifdef CONFIG_DEBUG_RT_MUTEXES
11241 +# define __RT_SPIN_INITIALIZER(name) \
11242 +       { \
11243 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
11244 +       .save_state = 1, \
11245 +       .file = __FILE__, \
11246 +       .line = __LINE__ , \
11247 +       }
11248 +#else
11249 +# define __RT_SPIN_INITIALIZER(name) \
11250 +       {                                                               \
11251 +       .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),          \
11252 +       .save_state = 1, \
11253 +       }
11254 +#endif
11255 +
11256 +/*
11257 +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock)
11258 +*/
11259 +
11260 +#define __SPIN_LOCK_UNLOCKED(name)                     \
11261 +       { .lock = __RT_SPIN_INITIALIZER(name.lock),             \
11262 +         SPIN_DEP_MAP_INIT(name) }
11263 +
11264 +#define DEFINE_SPINLOCK(name) \
11265 +       spinlock_t name = __SPIN_LOCK_UNLOCKED(name)
11266 +
11267 +#endif
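
The three headers above split the old spinlock_types.h: spinlock_types_raw.h always provides raw_spinlock_t, while spinlock_t comes from either the _nort or the _rt variant. Code that needs a genuinely spinning, IRQ-off lock on both configurations keeps using raw_spinlock_t; a minimal sketch with hypothetical names (typical RT guidance, not something stated in this hunk):

    static DEFINE_RAW_SPINLOCK(example_raw_lock);   /* hypothetical */

    static void example_hardirq_safe(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&example_raw_lock, flags);
            /* truly IRQ-off critical section, on RT and !RT alike */
            raw_spin_unlock_irqrestore(&example_raw_lock, flags);
    }
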
11268 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
11269 index dc8eb63c6568..e793d3a257da 100644
11270 --- a/include/linux/srcu.h
11271 +++ b/include/linux/srcu.h
11272 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
11273  
11274  void process_srcu(struct work_struct *work);
11275  
11276 -#define __SRCU_STRUCT_INIT(name)                                       \
11277 +#define __SRCU_STRUCT_INIT(name, pcpu_name)                            \
11278         {                                                               \
11279                 .completed = -300,                                      \
11280 -               .per_cpu_ref = &name##_srcu_array,                      \
11281 +               .per_cpu_ref = &pcpu_name,                              \
11282                 .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),    \
11283                 .running = false,                                       \
11284                 .batch_queue = RCU_BATCH_INIT(name.batch_queue),        \
11285 @@ -119,7 +119,7 @@ void process_srcu(struct work_struct *work);
11286   */
11287  #define __DEFINE_SRCU(name, is_static)                                 \
11288         static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
11289 -       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
11290 +       is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_array)
11291  #define DEFINE_SRCU(name)              __DEFINE_SRCU(name, /* not static */)
11292  #define DEFINE_STATIC_SRCU(name)       __DEFINE_SRCU(name, static)
11293  
11294 diff --git a/include/linux/suspend.h b/include/linux/suspend.h
11295 index 7693e39b14fe..b36eedeb28d1 100644
11296 --- a/include/linux/suspend.h
11297 +++ b/include/linux/suspend.h
11298 @@ -193,6 +193,12 @@ struct platform_freeze_ops {
11299         void (*end)(void);
11300  };
11301  
11302 +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION)
11303 +extern bool pm_in_action;
11304 +#else
11305 +# define pm_in_action false
11306 +#endif
11307 +
11308  #ifdef CONFIG_SUSPEND
11309  /**
11310   * suspend_set_ops - set platform dependent suspend operations
11311 diff --git a/include/linux/swait.h b/include/linux/swait.h
11312 index c1f9c62a8a50..83f004a72320 100644
11313 --- a/include/linux/swait.h
11314 +++ b/include/linux/swait.h
11315 @@ -87,6 +87,7 @@ static inline int swait_active(struct swait_queue_head *q)
11316  extern void swake_up(struct swait_queue_head *q);
11317  extern void swake_up_all(struct swait_queue_head *q);
11318  extern void swake_up_locked(struct swait_queue_head *q);
11319 +extern void swake_up_all_locked(struct swait_queue_head *q);
11320  
11321  extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
11322  extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
11323 diff --git a/include/linux/swap.h b/include/linux/swap.h
11324 index e1d761463243..4ae9a4434ad3 100644
11325 --- a/include/linux/swap.h
11326 +++ b/include/linux/swap.h
11327 @@ -11,6 +11,7 @@
11328  #include <linux/fs.h>
11329  #include <linux/atomic.h>
11330  #include <linux/page-flags.h>
11331 +#include <linux/locallock.h>
11332  #include <asm/page.h>
11333  
11334  struct notifier_block;
11335 @@ -243,7 +244,8 @@ struct swap_info_struct {
11336  void *workingset_eviction(struct address_space *mapping, struct page *page);
11337  bool workingset_refault(void *shadow);
11338  void workingset_activation(struct page *page);
11339 -extern struct list_lru workingset_shadow_nodes;
11340 +extern struct list_lru __workingset_shadow_nodes;
11341 +DECLARE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
11342  
11343  static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
11344  {
11345 @@ -288,6 +290,7 @@ extern unsigned long nr_free_pagecache_pages(void);
11346  
11347  
11348  /* linux/mm/swap.c */
11349 +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock);
11350  extern void lru_cache_add(struct page *);
11351  extern void lru_cache_add_anon(struct page *page);
11352  extern void lru_cache_add_file(struct page *page);
11353 diff --git a/include/linux/swork.h b/include/linux/swork.h
11354 new file mode 100644
11355 index 000000000000..f175fa9a6016
11356 --- /dev/null
11357 +++ b/include/linux/swork.h
11358 @@ -0,0 +1,24 @@
11359 +#ifndef _LINUX_SWORK_H
11360 +#define _LINUX_SWORK_H
11361 +
11362 +#include <linux/list.h>
11363 +
11364 +struct swork_event {
11365 +       struct list_head item;
11366 +       unsigned long flags;
11367 +       void (*func)(struct swork_event *);
11368 +};
11369 +
11370 +static inline void INIT_SWORK(struct swork_event *event,
11371 +                             void (*func)(struct swork_event *))
11372 +{
11373 +       event->flags = 0;
11374 +       event->func = func;
11375 +}
11376 +
11377 +bool swork_queue(struct swork_event *sev);
11378 +
11379 +int swork_get(void);
11380 +void swork_put(void);
11381 +
11382 +#endif /* _LINUX_SWORK_H */
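
swork is a minimal "simple work" deferral API added by -rt: users take a reference on the helper thread with swork_get(), initialize an event with a callback via INIT_SWORK(), and queue it with swork_queue() (the kernel/cgroup.c hunk further down is an in-tree user). A minimal sketch with hypothetical names:

    static struct swork_event example_event;

    static void example_fn(struct swork_event *sev)
    {
            /* runs in the swork helper thread's context */
    }

    static int example_init(void)
    {
            int err = swork_get();          /* bring up / reference the worker */

            if (err)
                    return err;
            INIT_SWORK(&example_event, example_fn);
            swork_queue(&example_event);
            return 0;
    }
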
11383 diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
11384 index 2b5b10eed74f..8bf15b1858f5 100644
11385 --- a/include/linux/thread_info.h
11386 +++ b/include/linux/thread_info.h
11387 @@ -103,7 +103,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
11388  #define test_thread_flag(flag) \
11389         test_ti_thread_flag(current_thread_info(), flag)
11390  
11391 -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
11392 +#ifdef CONFIG_PREEMPT_LAZY
11393 +#define tif_need_resched()     (test_thread_flag(TIF_NEED_RESCHED) || \
11394 +                                test_thread_flag(TIF_NEED_RESCHED_LAZY))
11395 +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED))
11396 +#define tif_need_resched_lazy()        (test_thread_flag(TIF_NEED_RESCHED_LAZY))
11397 +
11398 +#else
11399 +#define tif_need_resched()     test_thread_flag(TIF_NEED_RESCHED)
11400 +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED)
11401 +#define tif_need_resched_lazy()        0
11402 +#endif
11403  
11404  #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
11405  static inline int arch_within_stack_frames(const void * const stack,
11406 diff --git a/include/linux/timer.h b/include/linux/timer.h
11407 index 51d601f192d4..83cea629efe1 100644
11408 --- a/include/linux/timer.h
11409 +++ b/include/linux/timer.h
11410 @@ -241,7 +241,7 @@ extern void add_timer(struct timer_list *timer);
11411  
11412  extern int try_to_del_timer_sync(struct timer_list *timer);
11413  
11414 -#ifdef CONFIG_SMP
11415 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
11416    extern int del_timer_sync(struct timer_list *timer);
11417  #else
11418  # define del_timer_sync(t)             del_timer(t)
11419 diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
11420 index be007610ceb0..15154b13a53b 100644
11421 --- a/include/linux/trace_events.h
11422 +++ b/include/linux/trace_events.h
11423 @@ -56,6 +56,9 @@ struct trace_entry {
11424         unsigned char           flags;
11425         unsigned char           preempt_count;
11426         int                     pid;
11427 +       unsigned short          migrate_disable;
11428 +       unsigned short          padding;
11429 +       unsigned char           preempt_lazy_count;
11430  };
11431  
11432  #define TRACE_EVENT_TYPE_MAX                                           \
11433 diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
11434 index f30c187ed785..83bf0f798426 100644
11435 --- a/include/linux/uaccess.h
11436 +++ b/include/linux/uaccess.h
11437 @@ -24,6 +24,7 @@ static __always_inline void pagefault_disabled_dec(void)
11438   */
11439  static inline void pagefault_disable(void)
11440  {
11441 +       migrate_disable();
11442         pagefault_disabled_inc();
11443         /*
11444          * make sure to have issued the store before a pagefault
11445 @@ -40,6 +41,7 @@ static inline void pagefault_enable(void)
11446          */
11447         barrier();
11448         pagefault_disabled_dec();
11449 +       migrate_enable();
11450  }
11451  
11452  /*
11453 diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
11454 index 4a29c75b146e..0a294e950df8 100644
11455 --- a/include/linux/uprobes.h
11456 +++ b/include/linux/uprobes.h
11457 @@ -27,6 +27,7 @@
11458  #include <linux/errno.h>
11459  #include <linux/rbtree.h>
11460  #include <linux/types.h>
11461 +#include <linux/wait.h>
11462  
11463  struct vm_area_struct;
11464  struct mm_struct;
11465 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
11466 index 613771909b6e..e28c5a43229d 100644
11467 --- a/include/linux/vmstat.h
11468 +++ b/include/linux/vmstat.h
11469 @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states);
11470   */
11471  static inline void __count_vm_event(enum vm_event_item item)
11472  {
11473 +       preempt_disable_rt();
11474         raw_cpu_inc(vm_event_states.event[item]);
11475 +       preempt_enable_rt();
11476  }
11477  
11478  static inline void count_vm_event(enum vm_event_item item)
11479 @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item)
11480  
11481  static inline void __count_vm_events(enum vm_event_item item, long delta)
11482  {
11483 +       preempt_disable_rt();
11484         raw_cpu_add(vm_event_states.event[item], delta);
11485 +       preempt_enable_rt();
11486  }
11487  
11488  static inline void count_vm_events(enum vm_event_item item, long delta)
11489 diff --git a/include/linux/wait.h b/include/linux/wait.h
11490 index c3ff74d764fa..60222150a409 100644
11491 --- a/include/linux/wait.h
11492 +++ b/include/linux/wait.h
11493 @@ -8,6 +8,7 @@
11494  #include <linux/spinlock.h>
11495  #include <asm/current.h>
11496  #include <uapi/linux/wait.h>
11497 +#include <linux/atomic.h>
11498  
11499  typedef struct __wait_queue wait_queue_t;
11500  typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
11501 diff --git a/include/net/dst.h b/include/net/dst.h
11502 index 6835d224d47b..55a5a9698f14 100644
11503 --- a/include/net/dst.h
11504 +++ b/include/net/dst.h
11505 @@ -446,7 +446,7 @@ static inline void dst_confirm(struct dst_entry *dst)
11506  static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
11507                                    struct sk_buff *skb)
11508  {
11509 -       const struct hh_cache *hh;
11510 +       struct hh_cache *hh;
11511  
11512         if (dst->pending_confirm) {
11513                 unsigned long now = jiffies;
11514 diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
11515 index 231e121cc7d9..d125222b979d 100644
11516 --- a/include/net/gen_stats.h
11517 +++ b/include/net/gen_stats.h
11518 @@ -5,6 +5,7 @@
11519  #include <linux/socket.h>
11520  #include <linux/rtnetlink.h>
11521  #include <linux/pkt_sched.h>
11522 +#include <net/net_seq_lock.h>
11523  
11524  struct gnet_stats_basic_cpu {
11525         struct gnet_stats_basic_packed bstats;
11526 @@ -33,11 +34,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
11527                                  spinlock_t *lock, struct gnet_dump *d,
11528                                  int padattr);
11529  
11530 -int gnet_stats_copy_basic(const seqcount_t *running,
11531 +int gnet_stats_copy_basic(net_seqlock_t *running,
11532                           struct gnet_dump *d,
11533                           struct gnet_stats_basic_cpu __percpu *cpu,
11534                           struct gnet_stats_basic_packed *b);
11535 -void __gnet_stats_copy_basic(const seqcount_t *running,
11536 +void __gnet_stats_copy_basic(net_seqlock_t *running,
11537                              struct gnet_stats_basic_packed *bstats,
11538                              struct gnet_stats_basic_cpu __percpu *cpu,
11539                              struct gnet_stats_basic_packed *b);
11540 @@ -55,14 +56,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
11541                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
11542                       struct gnet_stats_rate_est64 *rate_est,
11543                       spinlock_t *stats_lock,
11544 -                     seqcount_t *running, struct nlattr *opt);
11545 +                     net_seqlock_t *running, struct nlattr *opt);
11546  void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
11547                         struct gnet_stats_rate_est64 *rate_est);
11548  int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
11549                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
11550                           struct gnet_stats_rate_est64 *rate_est,
11551                           spinlock_t *stats_lock,
11552 -                         seqcount_t *running, struct nlattr *opt);
11553 +                         net_seqlock_t *running, struct nlattr *opt);
11554  bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
11555                           const struct gnet_stats_rate_est64 *rate_est);
11556  #endif
11557 diff --git a/include/net/neighbour.h b/include/net/neighbour.h
11558 index 8b683841e574..bf656008f6e7 100644
11559 --- a/include/net/neighbour.h
11560 +++ b/include/net/neighbour.h
11561 @@ -446,7 +446,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
11562  }
11563  #endif
11564  
11565 -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
11566 +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
11567  {
11568         unsigned int seq;
11569         int hh_len;
11570 @@ -501,7 +501,7 @@ struct neighbour_cb {
11571  
11572  #define NEIGH_CB(skb)  ((struct neighbour_cb *)(skb)->cb)
11573  
11574 -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
11575 +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
11576                                      const struct net_device *dev)
11577  {
11578         unsigned int seq;
11579 diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h
11580 new file mode 100644
11581 index 000000000000..a7034298a82a
11582 --- /dev/null
11583 +++ b/include/net/net_seq_lock.h
11584 @@ -0,0 +1,15 @@
11585 +#ifndef __NET_NET_SEQ_LOCK_H__
11586 +#define __NET_NET_SEQ_LOCK_H__
11587 +
11588 +#ifdef CONFIG_PREEMPT_RT_BASE
11589 +# define net_seqlock_t                 seqlock_t
11590 +# define net_seq_begin(__r)            read_seqbegin(__r)
11591 +# define net_seq_retry(__r, __s)       read_seqretry(__r, __s)
11592 +
11593 +#else
11594 +# define net_seqlock_t                 seqcount_t
11595 +# define net_seq_begin(__r)            read_seqcount_begin(__r)
11596 +# define net_seq_retry(__r, __s)       read_seqcount_retry(__r, __s)
11597 +#endif
11598 +
11599 +#endif
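
net_seqlock_t wraps the qdisc "running" sequence primitive: a plain seqcount_t normally, a full seqlock_t on PREEMPT_RT_BASE so the write side has a lock it can sleep on. Readers use the wrappers the same way in both configurations; a minimal retry-loop sketch (hypothetical names, assumes the usual kernel type headers):

    static u64 example_read_stat(net_seqlock_t *running, const u64 *stat)
    {
            unsigned int seq;
            u64 val;

            do {
                    seq = net_seq_begin(running);
                    val = *stat;
            } while (net_seq_retry(running, seq));

            return val;
    }
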
11600 diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
11601 index d061ffeb1e71..12ef433dc3b8 100644
11602 --- a/include/net/netns/ipv4.h
11603 +++ b/include/net/netns/ipv4.h
11604 @@ -70,6 +70,7 @@ struct netns_ipv4 {
11605  
11606         int sysctl_icmp_echo_ignore_all;
11607         int sysctl_icmp_echo_ignore_broadcasts;
11608 +       int sysctl_icmp_echo_sysrq;
11609         int sysctl_icmp_ignore_bogus_error_responses;
11610         int sysctl_icmp_ratelimit;
11611         int sysctl_icmp_ratemask;
11612 diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
11613 index 909aff2db2b3..c47219d6e4bc 100644
11614 --- a/include/net/sch_generic.h
11615 +++ b/include/net/sch_generic.h
11616 @@ -10,6 +10,7 @@
11617  #include <linux/dynamic_queue_limits.h>
11618  #include <net/gen_stats.h>
11619  #include <net/rtnetlink.h>
11620 +#include <net/net_seq_lock.h>
11621  
11622  struct Qdisc_ops;
11623  struct qdisc_walker;
11624 @@ -78,7 +79,7 @@ struct Qdisc {
11625         struct sk_buff          *gso_skb ____cacheline_aligned_in_smp;
11626         struct sk_buff_head     q;
11627         struct gnet_stats_basic_packed bstats;
11628 -       seqcount_t              running;
11629 +       net_seqlock_t           running;
11630         struct gnet_stats_queue qstats;
11631         unsigned long           state;
11632         struct Qdisc            *next_sched;
11633 @@ -90,13 +91,22 @@ struct Qdisc {
11634         spinlock_t              busylock ____cacheline_aligned_in_smp;
11635  };
11636  
11637 -static inline bool qdisc_is_running(const struct Qdisc *qdisc)
11638 +static inline bool qdisc_is_running(struct Qdisc *qdisc)
11639  {
11640 +#ifdef CONFIG_PREEMPT_RT_BASE
11641 +       return spin_is_locked(&qdisc->running.lock) ? true : false;
11642 +#else
11643         return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
11644 +#endif
11645  }
11646  
11647  static inline bool qdisc_run_begin(struct Qdisc *qdisc)
11648  {
11649 +#ifdef CONFIG_PREEMPT_RT_BASE
11650 +       if (try_write_seqlock(&qdisc->running))
11651 +               return true;
11652 +       return false;
11653 +#else
11654         if (qdisc_is_running(qdisc))
11655                 return false;
11656         /* Variant of write_seqcount_begin() telling lockdep a trylock
11657 @@ -105,11 +115,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
11658         raw_write_seqcount_begin(&qdisc->running);
11659         seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
11660         return true;
11661 +#endif
11662  }
11663  
11664  static inline void qdisc_run_end(struct Qdisc *qdisc)
11665  {
11666 +#ifdef CONFIG_PREEMPT_RT_BASE
11667 +       write_sequnlock(&qdisc->running);
11668 +#else
11669         write_seqcount_end(&qdisc->running);
11670 +#endif
11671  }
11672  
11673  static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
11674 @@ -300,7 +315,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
11675         return qdisc_lock(root);
11676  }
11677  
11678 -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
11679 +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
11680  {
11681         struct Qdisc *root = qdisc_root_sleeping(qdisc);
11682  
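
With the hunks above, qdisc_run_begin() on RT becomes a try_write_seqlock() on the running seqlock and qdisc_is_running() tests the embedded lock instead of the sequence LSB. The calling pattern in the dispatch path stays the same; roughly (simplified sketch mirroring how the net/core callers use these helpers):

    static void example_qdisc_dispatch(struct Qdisc *q)
    {
            if (qdisc_run_begin(q)) {
                    /* exactly one CPU dequeues/transmits for this qdisc at a time */
                    /* ... dequeue and send packets ... */
                    qdisc_run_end(q);
            }
    }
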
11683 diff --git a/include/trace/events/hist.h b/include/trace/events/hist.h
11684 new file mode 100644
11685 index 000000000000..f7710de1b1f3
11686 --- /dev/null
11687 +++ b/include/trace/events/hist.h
11688 @@ -0,0 +1,73 @@
11689 +#undef TRACE_SYSTEM
11690 +#define TRACE_SYSTEM hist
11691 +
11692 +#if !defined(_TRACE_HIST_H) || defined(TRACE_HEADER_MULTI_READ)
11693 +#define _TRACE_HIST_H
11694 +
11695 +#include "latency_hist.h"
11696 +#include <linux/tracepoint.h>
11697 +
11698 +#if !defined(CONFIG_PREEMPT_OFF_HIST) && !defined(CONFIG_INTERRUPT_OFF_HIST)
11699 +#define trace_preemptirqsoff_hist(a, b)
11700 +#define trace_preemptirqsoff_hist_rcuidle(a, b)
11701 +#else
11702 +TRACE_EVENT(preemptirqsoff_hist,
11703 +
11704 +       TP_PROTO(int reason, int starthist),
11705 +
11706 +       TP_ARGS(reason, starthist),
11707 +
11708 +       TP_STRUCT__entry(
11709 +               __field(int,    reason)
11710 +               __field(int,    starthist)
11711 +       ),
11712 +
11713 +       TP_fast_assign(
11714 +               __entry->reason         = reason;
11715 +               __entry->starthist      = starthist;
11716 +       ),
11717 +
11718 +       TP_printk("reason=%s starthist=%s", getaction(__entry->reason),
11719 +                 __entry->starthist ? "start" : "stop")
11720 +);
11721 +#endif
11722 +
11723 +#ifndef CONFIG_MISSED_TIMER_OFFSETS_HIST
11724 +#define trace_hrtimer_interrupt(a, b, c, d)
11725 +#else
11726 +TRACE_EVENT(hrtimer_interrupt,
11727 +
11728 +       TP_PROTO(int cpu, long long offset, struct task_struct *curr,
11729 +               struct task_struct *task),
11730 +
11731 +       TP_ARGS(cpu, offset, curr, task),
11732 +
11733 +       TP_STRUCT__entry(
11734 +               __field(int,            cpu)
11735 +               __field(long long,      offset)
11736 +               __array(char,           ccomm,  TASK_COMM_LEN)
11737 +               __field(int,            cprio)
11738 +               __array(char,           tcomm,  TASK_COMM_LEN)
11739 +               __field(int,            tprio)
11740 +       ),
11741 +
11742 +       TP_fast_assign(
11743 +               __entry->cpu    = cpu;
11744 +               __entry->offset = offset;
11745 +               memcpy(__entry->ccomm, curr->comm, TASK_COMM_LEN);
11746 +               __entry->cprio  = curr->prio;
11747 +               memcpy(__entry->tcomm, task != NULL ? task->comm : "<none>",
11748 +                       task != NULL ? TASK_COMM_LEN : 7);
11749 +               __entry->tprio  = task != NULL ? task->prio : -1;
11750 +       ),
11751 +
11752 +       TP_printk("cpu=%d offset=%lld curr=%s[%d] thread=%s[%d]",
11753 +               __entry->cpu, __entry->offset, __entry->ccomm,
11754 +               __entry->cprio, __entry->tcomm, __entry->tprio)
11755 +);
11756 +#endif
11757 +
11758 +#endif /* _TRACE_HIST_H */
11759 +
11760 +/* This part must be outside protection */
11761 +#include <trace/define_trace.h>
11762 diff --git a/include/trace/events/latency_hist.h b/include/trace/events/latency_hist.h
11763 new file mode 100644
11764 index 000000000000..d3f2fbd560b1
11765 --- /dev/null
11766 +++ b/include/trace/events/latency_hist.h
11767 @@ -0,0 +1,29 @@
11768 +#ifndef _LATENCY_HIST_H
11769 +#define _LATENCY_HIST_H
11770 +
11771 +enum hist_action {
11772 +       IRQS_ON,
11773 +       PREEMPT_ON,
11774 +       TRACE_STOP,
11775 +       IRQS_OFF,
11776 +       PREEMPT_OFF,
11777 +       TRACE_START,
11778 +};
11779 +
11780 +static char *actions[] = {
11781 +       "IRQS_ON",
11782 +       "PREEMPT_ON",
11783 +       "TRACE_STOP",
11784 +       "IRQS_OFF",
11785 +       "PREEMPT_OFF",
11786 +       "TRACE_START",
11787 +};
11788 +
11789 +static inline char *getaction(int action)
11790 +{
11791 +       if (action >= 0 && action < sizeof(actions)/sizeof(actions[0]))
11792 +               return actions[action];
11793 +       return "unknown";
11794 +}
11795 +
11796 +#endif /* _LATENCY_HIST_H */
11797 diff --git a/init/Kconfig b/init/Kconfig
11798 index cac3f096050d..b6c9166d878a 100644
11799 --- a/init/Kconfig
11800 +++ b/init/Kconfig
11801 @@ -496,7 +496,7 @@ config TINY_RCU
11802  
11803  config RCU_EXPERT
11804         bool "Make expert-level adjustments to RCU configuration"
11805 -       default n
11806 +       default y if PREEMPT_RT_FULL
11807         help
11808           This option needs to be enabled if you wish to make
11809           expert-level adjustments to RCU configuration.  By default,
11810 @@ -613,7 +613,7 @@ config RCU_FANOUT_LEAF
11811  
11812  config RCU_FAST_NO_HZ
11813         bool "Accelerate last non-dyntick-idle CPU's grace periods"
11814 -       depends on NO_HZ_COMMON && SMP && RCU_EXPERT
11815 +       depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL
11816         default n
11817         help
11818           This option permits CPUs to enter dynticks-idle state even if
11819 @@ -640,7 +640,7 @@ config TREE_RCU_TRACE
11820  config RCU_BOOST
11821         bool "Enable RCU priority boosting"
11822         depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
11823 -       default n
11824 +       default y if PREEMPT_RT_FULL
11825         help
11826           This option boosts the priority of preempted RCU readers that
11827           block the current preemptible RCU grace period for too long.
11828 @@ -1054,6 +1054,7 @@ config CFS_BANDWIDTH
11829  config RT_GROUP_SCHED
11830         bool "Group scheduling for SCHED_RR/FIFO"
11831         depends on CGROUP_SCHED
11832 +       depends on !PREEMPT_RT_FULL
11833         default n
11834         help
11835           This feature lets you explicitly allocate real CPU bandwidth
11836 @@ -1761,6 +1762,7 @@ choice
11837  
11838  config SLAB
11839         bool "SLAB"
11840 +       depends on !PREEMPT_RT_FULL
11841         select HAVE_HARDENED_USERCOPY_ALLOCATOR
11842         help
11843           The regular slab allocator that is established and known to work
11844 @@ -1781,6 +1783,7 @@ config SLUB
11845  config SLOB
11846         depends on EXPERT
11847         bool "SLOB (Simple Allocator)"
11848 +       depends on !PREEMPT_RT_FULL
11849         help
11850            SLOB replaces the stock allocator with a drastically simpler
11851            allocator. SLOB is generally more space efficient but
11852 @@ -1799,7 +1802,7 @@ config SLAB_FREELIST_RANDOM
11853  
11854  config SLUB_CPU_PARTIAL
11855         default y
11856 -       depends on SLUB && SMP
11857 +       depends on SLUB && SMP && !PREEMPT_RT_FULL
11858         bool "SLUB per cpu partial cache"
11859         help
11860           Per cpu partial caches accellerate objects allocation and freeing
11861 diff --git a/init/Makefile b/init/Makefile
11862 index 7bc47ee31c36..88cf473554e0 100644
11863 --- a/init/Makefile
11864 +++ b/init/Makefile
11865 @@ -33,4 +33,4 @@ $(obj)/version.o: include/generated/compile.h
11866  include/generated/compile.h: FORCE
11867         @$($(quiet)chk_compile.h)
11868         $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
11869 -       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)"
11870 +       "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)"
11871 diff --git a/init/main.c b/init/main.c
11872 index a8a58e2794a5..e4c979e37a91 100644
11873 --- a/init/main.c
11874 +++ b/init/main.c
11875 @@ -507,6 +507,7 @@ asmlinkage __visible void __init start_kernel(void)
11876         setup_command_line(command_line);
11877         setup_nr_cpu_ids();
11878         setup_per_cpu_areas();
11879 +       softirq_early_init();
11880         boot_cpu_state_init();
11881         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
11882  
11883 diff --git a/ipc/msg.c b/ipc/msg.c
11884 index c6521c205cb4..996d89023552 100644
11885 --- a/ipc/msg.c
11886 +++ b/ipc/msg.c
11887 @@ -183,20 +183,14 @@ static void ss_wakeup(struct list_head *h, int kill)
11888         }
11889  }
11890  
11891 -static void expunge_all(struct msg_queue *msq, int res)
11892 +static void expunge_all(struct msg_queue *msq, int res,
11893 +                       struct wake_q_head *wake_q)
11894  {
11895         struct msg_receiver *msr, *t;
11896  
11897         list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
11898 -               msr->r_msg = NULL; /* initialize expunge ordering */
11899 -               wake_up_process(msr->r_tsk);
11900 -               /*
11901 -                * Ensure that the wakeup is visible before setting r_msg as
11902 -                * the receiving end depends on it: either spinning on a nil,
11903 -                * or dealing with -EAGAIN cases. See lockless receive part 1
11904 -                * and 2 in do_msgrcv().
11905 -                */
11906 -               smp_wmb(); /* barrier (B) */
11907 +
11908 +               wake_q_add(wake_q, msr->r_tsk);
11909                 msr->r_msg = ERR_PTR(res);
11910         }
11911  }
11912 @@ -213,11 +207,13 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
11913  {
11914         struct msg_msg *msg, *t;
11915         struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
11916 +       WAKE_Q(wake_q);
11917  
11918 -       expunge_all(msq, -EIDRM);
11919 +       expunge_all(msq, -EIDRM, &wake_q);
11920         ss_wakeup(&msq->q_senders, 1);
11921         msg_rmid(ns, msq);
11922         ipc_unlock_object(&msq->q_perm);
11923 +       wake_up_q(&wake_q);
11924         rcu_read_unlock();
11925  
11926         list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
11927 @@ -342,6 +338,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
11928         struct kern_ipc_perm *ipcp;
11929         struct msqid64_ds uninitialized_var(msqid64);
11930         struct msg_queue *msq;
11931 +       WAKE_Q(wake_q);
11932         int err;
11933  
11934         if (cmd == IPC_SET) {
11935 @@ -389,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
11936                 /* sleeping receivers might be excluded by
11937                  * stricter permissions.
11938                  */
11939 -               expunge_all(msq, -EAGAIN);
11940 +               expunge_all(msq, -EAGAIN, &wake_q);
11941                 /* sleeping senders might be able to send
11942                  * due to a larger queue size.
11943                  */
11944 @@ -402,6 +399,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
11945  
11946  out_unlock0:
11947         ipc_unlock_object(&msq->q_perm);
11948 +       wake_up_q(&wake_q);
11949  out_unlock1:
11950         rcu_read_unlock();
11951  out_up:
11952 @@ -566,7 +564,8 @@ static int testmsg(struct msg_msg *msg, long type, int mode)
11953         return 0;
11954  }
11955  
11956 -static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
11957 +static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg,
11958 +                                struct wake_q_head *wake_q)
11959  {
11960         struct msg_receiver *msr, *t;
11961  
11962 @@ -577,27 +576,13 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
11963  
11964                         list_del(&msr->r_list);
11965                         if (msr->r_maxsize < msg->m_ts) {
11966 -                               /* initialize pipelined send ordering */
11967 -                               msr->r_msg = NULL;
11968 -                               wake_up_process(msr->r_tsk);
11969 -                               /* barrier (B) see barrier comment below */
11970 -                               smp_wmb();
11971 +                               wake_q_add(wake_q, msr->r_tsk);
11972                                 msr->r_msg = ERR_PTR(-E2BIG);
11973                         } else {
11974 -                               msr->r_msg = NULL;
11975                                 msq->q_lrpid = task_pid_vnr(msr->r_tsk);
11976                                 msq->q_rtime = get_seconds();
11977 -                               wake_up_process(msr->r_tsk);
11978 -                               /*
11979 -                                * Ensure that the wakeup is visible before
11980 -                                * setting r_msg, as the receiving can otherwise
11981 -                                * exit - once r_msg is set, the receiver can
11982 -                                * continue. See lockless receive part 1 and 2
11983 -                                * in do_msgrcv(). Barrier (B).
11984 -                                */
11985 -                               smp_wmb();
11986 +                               wake_q_add(wake_q, msr->r_tsk);
11987                                 msr->r_msg = msg;
11988 -
11989                                 return 1;
11990                         }
11991                 }
11992 @@ -613,6 +598,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
11993         struct msg_msg *msg;
11994         int err;
11995         struct ipc_namespace *ns;
11996 +       WAKE_Q(wake_q);
11997  
11998         ns = current->nsproxy->ipc_ns;
11999  
12000 @@ -698,7 +684,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
12001         msq->q_lspid = task_tgid_vnr(current);
12002         msq->q_stime = get_seconds();
12003  
12004 -       if (!pipelined_send(msq, msg)) {
12005 +       if (!pipelined_send(msq, msg, &wake_q)) {
12006                 /* no one is waiting for this message, enqueue it */
12007                 list_add_tail(&msg->m_list, &msq->q_messages);
12008                 msq->q_cbytes += msgsz;
12009 @@ -712,6 +698,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
12010  
12011  out_unlock0:
12012         ipc_unlock_object(&msq->q_perm);
12013 +       wake_up_q(&wake_q);
12014  out_unlock1:
12015         rcu_read_unlock();
12016         if (msg != NULL)
12017 @@ -932,57 +919,25 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl
12018                 rcu_read_lock();
12019  
12020                 /* Lockless receive, part 2:
12021 -                * Wait until pipelined_send or expunge_all are outside of
12022 -                * wake_up_process(). There is a race with exit(), see
12023 -                * ipc/mqueue.c for the details. The correct serialization
12024 -                * ensures that a receiver cannot continue without the wakeup
12025 -                * being visibible _before_ setting r_msg:
12026 +                * The work in pipelined_send() and expunge_all():
12027 +                * - Set pointer to message
12028 +                * - Queue the receiver task for later wakeup
12029 +                * - Wake up the process after the lock is dropped.
12030                  *
12031 -                * CPU 0                             CPU 1
12032 -                * <loop receiver>
12033 -                *   smp_rmb(); (A) <-- pair -.      <waker thread>
12034 -                *   <load ->r_msg>           |        msr->r_msg = NULL;
12035 -                *                            |        wake_up_process();
12036 -                * <continue>                 `------> smp_wmb(); (B)
12037 -                *                                     msr->r_msg = msg;
12038 -                *
12039 -                * Where (A) orders the message value read and where (B) orders
12040 -                * the write to the r_msg -- done in both pipelined_send and
12041 -                * expunge_all.
12042 +                * Should the process wake up before this wakeup (due to a
12043 +                * signal), it will either see the message and continue …
12044                  */
12045 -               for (;;) {
12046 -                       /*
12047 -                        * Pairs with writer barrier in pipelined_send
12048 -                        * or expunge_all.
12049 -                        */
12050 -                       smp_rmb(); /* barrier (A) */
12051 -                       msg = (struct msg_msg *)msr_d.r_msg;
12052 -                       if (msg)
12053 -                               break;
12054  
12055 -                       /*
12056 -                        * The cpu_relax() call is a compiler barrier
12057 -                        * which forces everything in this loop to be
12058 -                        * re-loaded.
12059 -                        */
12060 -                       cpu_relax();
12061 -               }
12062 -
12063 -               /* Lockless receive, part 3:
12064 -                * If there is a message or an error then accept it without
12065 -                * locking.
12066 -                */
12067 +               msg = (struct msg_msg *)msr_d.r_msg;
12068                 if (msg != ERR_PTR(-EAGAIN))
12069                         goto out_unlock1;
12070  
12071 -               /* Lockless receive, part 3:
12072 -                * Acquire the queue spinlock.
12073 -                */
12074 +                /*
12075 +                 * … or see -EAGAIN, acquire the lock to check the message
12076 +                 * again.
12077 +                 */
12078                 ipc_lock_object(&msq->q_perm);
12079  
12080 -               /* Lockless receive, part 4:
12081 -                * Repeat test after acquiring the spinlock.
12082 -                */
12083                 msg = (struct msg_msg *)msr_d.r_msg;
12084                 if (msg != ERR_PTR(-EAGAIN))
12085                         goto out_unlock0;
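
The ipc/msg.c hunks replace the wake_up_process()+smp_wmb() ordering dance with wake_q batching: receivers are queued with wake_q_add() while the ipc object lock is held, r_msg is set, and the actual wakeups happen via wake_up_q() only after the lock is dropped. A minimal sketch of that pattern with hypothetical waiter bookkeeping (assumes the usual list/sched/spinlock headers):

    struct example_waiter {
            struct list_head        list;
            struct task_struct      *task;
    };

    static void example_wake_all(spinlock_t *lock, struct list_head *waiters)
    {
            struct example_waiter *w, *tmp;
            WAKE_Q(wake_q);

            spin_lock(lock);
            list_for_each_entry_safe(w, tmp, waiters, list) {
                    list_del(&w->list);
                    wake_q_add(&wake_q, w->task);   /* queue only, no wakeup yet */
            }
            spin_unlock(lock);

            wake_up_q(&wake_q);     /* wake everybody with the lock dropped */
    }
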
12086 diff --git a/ipc/sem.c b/ipc/sem.c
12087 index 5e318c5f749d..ec9203971539 100644
12088 --- a/ipc/sem.c
12089 +++ b/ipc/sem.c
12090 @@ -712,6 +712,13 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
12091  static void wake_up_sem_queue_prepare(struct list_head *pt,
12092                                 struct sem_queue *q, int error)
12093  {
12094 +#ifdef CONFIG_PREEMPT_RT_BASE
12095 +       struct task_struct *p = q->sleeper;
12096 +       get_task_struct(p);
12097 +       q->status = error;
12098 +       wake_up_process(p);
12099 +       put_task_struct(p);
12100 +#else
12101         if (list_empty(pt)) {
12102                 /*
12103                  * Hold preempt off so that we don't get preempted and have the
12104 @@ -723,6 +730,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
12105         q->pid = error;
12106  
12107         list_add_tail(&q->list, pt);
12108 +#endif
12109  }
12110  
12111  /**
12112 @@ -736,6 +744,7 @@ static void wake_up_sem_queue_prepare(struct list_head *pt,
12113   */
12114  static void wake_up_sem_queue_do(struct list_head *pt)
12115  {
12116 +#ifndef CONFIG_PREEMPT_RT_BASE
12117         struct sem_queue *q, *t;
12118         int did_something;
12119  
12120 @@ -748,6 +757,7 @@ static void wake_up_sem_queue_do(struct list_head *pt)
12121         }
12122         if (did_something)
12123                 preempt_enable();
12124 +#endif
12125  }
12126  
12127  static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
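
On PREEMPT_RT_BASE the deferred wake-up list is avoided, presumably because wake_up_sem_queue_do() keeps preemption disabled across the whole batch, which is exactly what RT wants to avoid; the RT branch above instead wakes each sleeper immediately while holding a task reference. The reference-counted wakeup it uses, in isolation (illustrative sketch):

    static void example_safe_wakeup(struct task_struct *p)
    {
            get_task_struct(p);     /* keep the task alive across the wakeup */
            wake_up_process(p);
            put_task_struct(p);
    }
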
12128 diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
12129 index ebdb0043203a..b9e6aa7e5aa6 100644
12130 --- a/kernel/Kconfig.locks
12131 +++ b/kernel/Kconfig.locks
12132 @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW
12133  
12134  config MUTEX_SPIN_ON_OWNER
12135         def_bool y
12136 -       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
12137 +       depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
12138  
12139  config RWSEM_SPIN_ON_OWNER
12140         def_bool y
12141 -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
12142 +       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL
12143  
12144  config LOCK_SPIN_ON_OWNER
12145         def_bool y
12146 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
12147 index 3f9c97419f02..11dbe26a8279 100644
12148 --- a/kernel/Kconfig.preempt
12149 +++ b/kernel/Kconfig.preempt
12150 @@ -1,3 +1,16 @@
12151 +config PREEMPT
12152 +       bool
12153 +       select PREEMPT_COUNT
12154 +
12155 +config PREEMPT_RT_BASE
12156 +       bool
12157 +       select PREEMPT
12158 +
12159 +config HAVE_PREEMPT_LAZY
12160 +       bool
12161 +
12162 +config PREEMPT_LAZY
12163 +       def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL
12164  
12165  choice
12166         prompt "Preemption Model"
12167 @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY
12168  
12169           Select this if you are building a kernel for a desktop system.
12170  
12171 -config PREEMPT
12172 +config PREEMPT__LL
12173         bool "Preemptible Kernel (Low-Latency Desktop)"
12174 -       select PREEMPT_COUNT
12175 +       select PREEMPT
12176         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
12177         help
12178           This option reduces the latency of the kernel by making
12179 @@ -52,6 +65,22 @@ config PREEMPT
12180           embedded system with latency requirements in the milliseconds
12181           range.
12182  
12183 +config PREEMPT_RTB
12184 +       bool "Preemptible Kernel (Basic RT)"
12185 +       select PREEMPT_RT_BASE
12186 +       help
12187 +         This option is basically the same as (Low-Latency Desktop) but
12188 +         enables changes which are preliminary for the full preemptible
12189 +         RT kernel.
12190 +
12191 +config PREEMPT_RT_FULL
12192 +       bool "Fully Preemptible Kernel (RT)"
12193 +       depends on IRQ_FORCED_THREADING
12194 +       select PREEMPT_RT_BASE
12195 +       select PREEMPT_RCU
12196 +       help
12197 +         A fully preemptible kernel: spinlocks become sleeping locks and most interrupt handlers run as threads.
12198 +
12199  endchoice
12200  
12201  config PREEMPT_COUNT
12202 diff --git a/kernel/Makefile b/kernel/Makefile
12203 index e2ec54e2b952..bff8214bf5f6 100644
12204 --- a/kernel/Makefile
12205 +++ b/kernel/Makefile
12206 @@ -11,6 +11,13 @@ obj-y     = fork.o exec_domain.o panic.o \
12207             notifier.o ksysfs.o cred.o reboot.o \
12208             async.o range.o smpboot.o
12209  
12210 +# Tracing may do some dangerous __builtin_return_address() operations
12211 +# We know they are dangerous, we don't need gcc telling us that.
12212 +ifdef CONFIG_USING_GET_LOCK_PARENT_IP
12213 +FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
12214 +KBUILD_CFLAGS += $(FRAME_CFLAGS)
12215 +endif
12216 +
12217  obj-$(CONFIG_MULTIUSER) += groups.o
12218  
12219  ifdef CONFIG_FUNCTION_TRACER
12220 diff --git a/kernel/cgroup.c b/kernel/cgroup.c
12221 index d6b729beba49..11d61b2ca938 100644
12222 --- a/kernel/cgroup.c
12223 +++ b/kernel/cgroup.c
12224 @@ -5027,10 +5027,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
12225         queue_work(cgroup_destroy_wq, &css->destroy_work);
12226  }
12227  
12228 -static void css_release_work_fn(struct work_struct *work)
12229 +static void css_release_work_fn(struct swork_event *sev)
12230  {
12231         struct cgroup_subsys_state *css =
12232 -               container_of(work, struct cgroup_subsys_state, destroy_work);
12233 +               container_of(sev, struct cgroup_subsys_state, destroy_swork);
12234         struct cgroup_subsys *ss = css->ss;
12235         struct cgroup *cgrp = css->cgroup;
12236  
12237 @@ -5071,8 +5071,8 @@ static void css_release(struct percpu_ref *ref)
12238         struct cgroup_subsys_state *css =
12239                 container_of(ref, struct cgroup_subsys_state, refcnt);
12240  
12241 -       INIT_WORK(&css->destroy_work, css_release_work_fn);
12242 -       queue_work(cgroup_destroy_wq, &css->destroy_work);
12243 +       INIT_SWORK(&css->destroy_swork, css_release_work_fn);
12244 +       swork_queue(&css->destroy_swork);
12245  }
12246  
12247  static void init_and_link_css(struct cgroup_subsys_state *css,
12248 @@ -5716,6 +5716,7 @@ static int __init cgroup_wq_init(void)
12249          */
12250         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
12251         BUG_ON(!cgroup_destroy_wq);
12252 +       BUG_ON(swork_get());
12253  
12254         /*
12255          * Used to destroy pidlists and separate to serve as flush domain.
12256 diff --git a/kernel/cpu.c b/kernel/cpu.c
12257 index 341bf80f80bd..b575429a8a00 100644
12258 --- a/kernel/cpu.c
12259 +++ b/kernel/cpu.c
12260 @@ -152,8 +152,8 @@ static struct {
12261  #endif
12262  } cpu_hotplug = {
12263         .active_writer = NULL,
12264 -       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
12265         .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
12266 +       .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
12267  #ifdef CONFIG_DEBUG_LOCK_ALLOC
12268         .dep_map = {.name = "cpu_hotplug.lock" },
12269  #endif
12270 @@ -166,6 +166,289 @@ static struct {
12271  #define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
12272  #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
12273  
12274 +/**
12275 + * hotplug_pcp - per cpu hotplug descriptor
12276 + * @unplug:    set when pin_current_cpu() needs to sync tasks
12277 + * @sync_tsk:  the task that waits for tasks to finish pinned sections
12278 + * @refcount:  counter of tasks in pinned sections
12279 + * @grab_lock: set when the tasks entering pinned sections should wait
12280 + * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
12281 + * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
12282 + * @mutex_init:        zero if the mutex hasn't been initialized yet.
12283 + *
12284 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
12285 + * is used as a flag and still exists after @sync_tsk has exited and
12286 + * @sync_tsk has been set to NULL.
12287 + */
12288 +struct hotplug_pcp {
12289 +       struct task_struct *unplug;
12290 +       struct task_struct *sync_tsk;
12291 +       int refcount;
12292 +       int grab_lock;
12293 +       struct completion synced;
12294 +       struct completion unplug_wait;
12295 +#ifdef CONFIG_PREEMPT_RT_FULL
12296 +       /*
12297 +        * Note, on PREEMPT_RT, the hotplug lock must save the state of
12298 +        * the task, otherwise the mutex will cause the task to fail
12299 +        * to sleep when required. (Because it's called from migrate_disable())
12300 +        *
12301 +        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
12302 +        * state.
12303 +        */
12304 +       spinlock_t lock;
12305 +#else
12306 +       struct mutex mutex;
12307 +#endif
12308 +       int mutex_init;
12309 +};
12310 +
12311 +#ifdef CONFIG_PREEMPT_RT_FULL
12312 +# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
12313 +# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
12314 +#else
12315 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
12316 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
12317 +#endif
12318 +
12319 +static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
12320 +
12321 +/**
12322 + * pin_current_cpu - Prevent the current cpu from being unplugged
12323 + *
12324 + * Lightweight version of get_online_cpus() to prevent cpu from being
12325 + * unplugged when code runs in a migration disabled region.
12326 + *
12327 + * Must be called with preemption disabled (preempt_count = 1)!
12328 + */
12329 +void pin_current_cpu(void)
12330 +{
12331 +       struct hotplug_pcp *hp;
12332 +       int force = 0;
12333 +
12334 +retry:
12335 +       hp = this_cpu_ptr(&hotplug_pcp);
12336 +
12337 +       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
12338 +           hp->unplug == current) {
12339 +               hp->refcount++;
12340 +               return;
12341 +       }
12342 +       if (hp->grab_lock) {
12343 +               preempt_enable();
12344 +               hotplug_lock(hp);
12345 +               hotplug_unlock(hp);
12346 +       } else {
12347 +               preempt_enable();
12348 +               /*
12349 +                * Try to push this task off of this CPU.
12350 +                */
12351 +               if (!migrate_me()) {
12352 +                       preempt_disable();
12353 +                       hp = this_cpu_ptr(&hotplug_pcp);
12354 +                       if (!hp->grab_lock) {
12355 +                               /*
12356 +                                * Just let it continue, it's already pinned
12357 +                                * or about to sleep.
12358 +                                */
12359 +                               force = 1;
12360 +                               goto retry;
12361 +                       }
12362 +                       preempt_enable();
12363 +               }
12364 +       }
12365 +       preempt_disable();
12366 +       goto retry;
12367 +}
12368 +
12369 +/**
12370 + * unpin_current_cpu - Allow unplug of current cpu
12371 + *
12372 + * Must be called with preemption or interrupts disabled!
12373 + */
12374 +void unpin_current_cpu(void)
12375 +{
12376 +       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
12377 +
12378 +       WARN_ON(hp->refcount <= 0);
12379 +
12380 +       /* This is safe. sync_unplug_thread is pinned to this cpu */
12381 +       if (!--hp->refcount && hp->unplug && hp->unplug != current)
12382 +               wake_up_process(hp->unplug);
12383 +}
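
pin_current_cpu()/unpin_current_cpu() above give migration-disabled code a lightweight way to hold off cpu_down() on the current CPU. An illustrative pairing that follows the "must be called with preemption disabled" rules in the comments (hypothetical caller, not a call site introduced by this patch):

    static void example_pinned_section(void)
    {
            preempt_disable();
            pin_current_cpu();      /* holds off a concurrent unplug of this CPU */

            /* ... per-CPU work that must finish before the CPU can go away ... */

            unpin_current_cpu();
            preempt_enable();
    }
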
12384 +
12385 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
12386 +{
12387 +       set_current_state(TASK_UNINTERRUPTIBLE);
12388 +       while (hp->refcount) {
12389 +               schedule_preempt_disabled();
12390 +               set_current_state(TASK_UNINTERRUPTIBLE);
12391 +       }
12392 +}
12393 +
12394 +static int sync_unplug_thread(void *data)
12395 +{
12396 +       struct hotplug_pcp *hp = data;
12397 +
12398 +       wait_for_completion(&hp->unplug_wait);
12399 +       preempt_disable();
12400 +       hp->unplug = current;
12401 +       wait_for_pinned_cpus(hp);
12402 +
12403 +       /*
12404 +        * This thread will synchronize the cpu_down() with threads
12405 +        * that have pinned the CPU. When the pinned CPU count reaches
12406 +        * zero, we inform the cpu_down code to continue to the next step.
12407 +        */
12408 +       set_current_state(TASK_UNINTERRUPTIBLE);
12409 +       preempt_enable();
12410 +       complete(&hp->synced);
12411 +
12412 +       /*
12413 +        * If all succeeds, the next step will need tasks to wait till
12414 +        * the CPU is offline before continuing. To do this, the grab_lock
12415 +        * is set and tasks going into pin_current_cpu() will block on the
12416 +        * mutex. But we still need to wait for those that are already in
12417 +        * pinned CPU sections. If cpu_down() fails, kthread_should_stop()
12418 +        * will kick this thread out.
12419 +        */
12420 +       while (!hp->grab_lock && !kthread_should_stop()) {
12421 +               schedule();
12422 +               set_current_state(TASK_UNINTERRUPTIBLE);
12423 +       }
12424 +
12425 +       /* Make sure grab_lock is seen before we see a stale completion */
12426 +       smp_mb();
12427 +
12428 +       /*
12429 +        * Now just before cpu_down() enters stop machine, we need to make
12430 +        * sure all tasks that are in pinned CPU sections are out, and new
12431 +        * tasks will now grab the lock, keeping them from entering pinned
12432 +        * CPU sections.
12433 +        */
12434 +       if (!kthread_should_stop()) {
12435 +               preempt_disable();
12436 +               wait_for_pinned_cpus(hp);
12437 +               preempt_enable();
12438 +               complete(&hp->synced);
12439 +       }
12440 +
12441 +       set_current_state(TASK_UNINTERRUPTIBLE);
12442 +       while (!kthread_should_stop()) {
12443 +               schedule();
12444 +               set_current_state(TASK_UNINTERRUPTIBLE);
12445 +       }
12446 +       set_current_state(TASK_RUNNING);
12447 +
12448 +       /*
12449 +        * Force this thread off this CPU as it's going down and
12450 +        * we don't want any more work on this CPU.
12451 +        */
12452 +       current->flags &= ~PF_NO_SETAFFINITY;
12453 +       set_cpus_allowed_ptr(current, cpu_present_mask);
12454 +       migrate_me();
12455 +       return 0;
12456 +}
12457 +
12458 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
12459 +{
12460 +       wake_up_process(hp->sync_tsk);
12461 +       wait_for_completion(&hp->synced);
12462 +}
12463 +
12464 +static void __cpu_unplug_wait(unsigned int cpu)
12465 +{
12466 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
12467 +
12468 +       complete(&hp->unplug_wait);
12469 +       wait_for_completion(&hp->synced);
12470 +}
12471 +
12472 +/*
12473 + * Start the sync_unplug_thread on the target cpu and wait for it to
12474 + * complete.
12475 + */
12476 +static int cpu_unplug_begin(unsigned int cpu)
12477 +{
12478 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
12479 +       int err;
12480 +
12481 +       /* Protected by cpu_hotplug.lock */
12482 +       if (!hp->mutex_init) {
12483 +#ifdef CONFIG_PREEMPT_RT_FULL
12484 +               spin_lock_init(&hp->lock);
12485 +#else
12486 +               mutex_init(&hp->mutex);
12487 +#endif
12488 +               hp->mutex_init = 1;
12489 +       }
12490 +
12491 +       /* Inform the scheduler to migrate tasks off this CPU */
12492 +       tell_sched_cpu_down_begin(cpu);
12493 +
12494 +       init_completion(&hp->synced);
12495 +       init_completion(&hp->unplug_wait);
12496 +
12497 +       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
12498 +       if (IS_ERR(hp->sync_tsk)) {
12499 +               err = PTR_ERR(hp->sync_tsk);
12500 +               hp->sync_tsk = NULL;
12501 +               return err;
12502 +       }
12503 +       kthread_bind(hp->sync_tsk, cpu);
12504 +
12505 +       /*
12506 +        * Wait for tasks to get out of the pinned sections;
12507 +        * it's still OK if new tasks enter. Some CPU notifiers will
12508 +        * wait for tasks that are going to enter these sections and
12509 +        * we must not have them block.
12510 +        */
12511 +       wake_up_process(hp->sync_tsk);
12512 +       return 0;
12513 +}
12514 +
12515 +static void cpu_unplug_sync(unsigned int cpu)
12516 +{
12517 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
12518 +
12519 +       init_completion(&hp->synced);
12520 +       /* The completion needs to be initialized before setting grab_lock */
12521 +       smp_wmb();
12522 +
12523 +       /* Grab the mutex before setting grab_lock */
12524 +       hotplug_lock(hp);
12525 +       hp->grab_lock = 1;
12526 +
12527 +       /*
12528 +        * The CPU notifiers have been completed.
12529 +        * Wait for tasks to get out of pinned CPU sections and have new
12530 +        * tasks block until the CPU is completely down.
12531 +        */
12532 +       __cpu_unplug_sync(hp);
12533 +
12534 +       /* All done with the sync thread */
12535 +       kthread_stop(hp->sync_tsk);
12536 +       hp->sync_tsk = NULL;
12537 +}
12538 +
12539 +static void cpu_unplug_done(unsigned int cpu)
12540 +{
12541 +       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
12542 +
12543 +       hp->unplug = NULL;
12544 +       /* Let all tasks know cpu unplug is finished before cleaning up */
12545 +       smp_wmb();
12546 +
12547 +       if (hp->sync_tsk)
12548 +               kthread_stop(hp->sync_tsk);
12549 +
12550 +       if (hp->grab_lock) {
12551 +               hotplug_unlock(hp);
12552 +               /* protected by cpu_hotplug.lock */
12553 +               hp->grab_lock = 0;
12554 +       }
12555 +       tell_sched_cpu_down_done(cpu);
12556 +}
12557  
12558  void get_online_cpus(void)
12559  {
12560 @@ -710,10 +993,14 @@ static int takedown_cpu(unsigned int cpu)
12561         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
12562         int err;
12563  
12564 +       __cpu_unplug_wait(cpu);
12565         /* Park the smpboot threads */
12566         kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
12567         smpboot_park_threads(cpu);
12568  
12569 +       /* Notifiers are done. Don't let any more tasks pin this CPU. */
12570 +       cpu_unplug_sync(cpu);
12571 +
12572         /*
12573          * Prevent irq alloc/free while the dying cpu reorganizes the
12574          * interrupt affinities.
12575 @@ -799,6 +1086,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
12576         struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
12577         int prev_state, ret = 0;
12578         bool hasdied = false;
12579 +       int mycpu;
12580 +       cpumask_var_t cpumask;
12581 +       cpumask_var_t cpumask_org;
12582  
12583         if (num_online_cpus() == 1)
12584                 return -EBUSY;
12585 @@ -806,7 +1096,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
12586         if (!cpu_present(cpu))
12587                 return -EINVAL;
12588  
12589 +       /* Move the downtaker off the unplug cpu */
12590 +       if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
12591 +               return -ENOMEM;
12592 +       if (!alloc_cpumask_var(&cpumask_org, GFP_KERNEL)) {
12593 +               free_cpumask_var(cpumask);
12594 +               return -ENOMEM;
12595 +       }
12596 +
12597 +       cpumask_copy(cpumask_org, tsk_cpus_allowed(current));
12598 +       cpumask_andnot(cpumask, cpu_online_mask, cpumask_of(cpu));
12599 +       set_cpus_allowed_ptr(current, cpumask);
12600 +       free_cpumask_var(cpumask);
12601 +       migrate_disable();
12602 +       mycpu = smp_processor_id();
12603 +       if (mycpu == cpu) {
12604 +               printk(KERN_ERR "Yuck! Still on unplug CPU!\n");
12605 +               migrate_enable();
12606 +               ret = -EBUSY;
12607 +               goto restore_cpus;
12608 +       }
12609 +
12610 +       migrate_enable();
12611         cpu_hotplug_begin();
12612 +       ret = cpu_unplug_begin(cpu);
12613 +       if (ret) {
12614 +               printk("cpu_unplug_begin(%d) failed\n", cpu);
12615 +               goto out_cancel;
12616 +       }
12617  
12618         cpuhp_tasks_frozen = tasks_frozen;
12619  
12620 @@ -845,10 +1162,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
12621  
12622         hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
12623  out:
12624 +       cpu_unplug_done(cpu);
12625 +out_cancel:
12626         cpu_hotplug_done();
12627         /* This post dead nonsense must die */
12628         if (!ret && hasdied)
12629                 cpu_notify_nofail(CPU_POST_DEAD, cpu);
12630 +restore_cpus:
12631 +       set_cpus_allowed_ptr(current, cpumask_org);
12632 +       free_cpumask_var(cpumask_org);
12633         return ret;
12634  }
12635  
12636 diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
12637 index fc1ef736253c..83c666537a7a 100644
12638 --- a/kernel/debug/kdb/kdb_io.c
12639 +++ b/kernel/debug/kdb/kdb_io.c
12640 @@ -554,7 +554,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12641         int linecount;
12642         int colcount;
12643         int logging, saved_loglevel = 0;
12644 -       int saved_trap_printk;
12645         int got_printf_lock = 0;
12646         int retlen = 0;
12647         int fnd, len;
12648 @@ -565,8 +564,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12649         unsigned long uninitialized_var(flags);
12650  
12651         preempt_disable();
12652 -       saved_trap_printk = kdb_trap_printk;
12653 -       kdb_trap_printk = 0;
12654  
12655         /* Serialize kdb_printf if multiple cpus try to write at once.
12656          * But if any cpu goes recursive in kdb, just print the output,
12657 @@ -855,7 +852,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
12658         } else {
12659                 __release(kdb_printf_lock);
12660         }
12661 -       kdb_trap_printk = saved_trap_printk;
12662         preempt_enable();
12663         return retlen;
12664  }
12665 @@ -865,9 +861,11 @@ int kdb_printf(const char *fmt, ...)
12666         va_list ap;
12667         int r;
12668  
12669 +       kdb_trap_printk++;
12670         va_start(ap, fmt);
12671         r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap);
12672         va_end(ap);
12673 +       kdb_trap_printk--;
12674  
12675         return r;
12676  }
12677 diff --git a/kernel/events/core.c b/kernel/events/core.c
12678 index fc9bb2225291..bc2db7e1ae04 100644
12679 --- a/kernel/events/core.c
12680 +++ b/kernel/events/core.c
12681 @@ -1042,6 +1042,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
12682         raw_spin_lock_init(&cpuctx->hrtimer_lock);
12683         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
12684         timer->function = perf_mux_hrtimer_handler;
12685 +       timer->irqsafe = 1;
12686  }
12687  
12688  static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
12689 @@ -8215,6 +8216,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
12690  
12691         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
12692         hwc->hrtimer.function = perf_swevent_hrtimer;
12693 +       hwc->hrtimer.irqsafe = 1;
12694  
12695         /*
12696          * Since hrtimers have a fixed rate, we can do a static freq->period
12697 diff --git a/kernel/exit.c b/kernel/exit.c
12698 index 091a78be3b09..170b672bbb38 100644
12699 --- a/kernel/exit.c
12700 +++ b/kernel/exit.c
12701 @@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk)
12702          * Do this under ->siglock, we can race with another thread
12703          * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
12704          */
12705 -       flush_sigqueue(&tsk->pending);
12706 +       flush_task_sigqueue(tsk);
12707         tsk->sighand = NULL;
12708         spin_unlock(&sighand->siglock);
12709  
12710 diff --git a/kernel/fork.c b/kernel/fork.c
12711 index beb31725f7e2..e398cb9e62fa 100644
12712 --- a/kernel/fork.c
12713 +++ b/kernel/fork.c
12714 @@ -251,7 +251,9 @@ static inline void put_signal_struct(struct signal_struct *sig)
12715         if (atomic_dec_and_test(&sig->sigcnt))
12716                 free_signal_struct(sig);
12717  }
12718 -
12719 +#ifdef CONFIG_PREEMPT_RT_BASE
12720 +static
12721 +#endif
12722  void __put_task_struct(struct task_struct *tsk)
12723  {
12724         WARN_ON(!tsk->exit_state);
12725 @@ -268,7 +270,18 @@ void __put_task_struct(struct task_struct *tsk)
12726         if (!profile_handoff_task(tsk))
12727                 free_task(tsk);
12728  }
12729 +#ifndef CONFIG_PREEMPT_RT_BASE
12730  EXPORT_SYMBOL_GPL(__put_task_struct);
12731 +#else
12732 +void __put_task_struct_cb(struct rcu_head *rhp)
12733 +{
12734 +       struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu);
12735 +
12736 +       __put_task_struct(tsk);
12737 +
12738 +}
12739 +EXPORT_SYMBOL_GPL(__put_task_struct_cb);
12740 +#endif
12741  
12742  void __init __weak arch_task_cache_init(void) { }
12743  
12744 @@ -702,6 +715,19 @@ void __mmdrop(struct mm_struct *mm)
12745  }
12746  EXPORT_SYMBOL_GPL(__mmdrop);
12747  
12748 +#ifdef CONFIG_PREEMPT_RT_BASE
12749 +/*
12750 + * RCU callback for delayed mm drop. Not strictly RCU, but we don't
12751 + * want another facility to make this work.
12752 + */
12753 +void __mmdrop_delayed(struct rcu_head *rhp)
12754 +{
12755 +       struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop);
12756 +
12757 +       __mmdrop(mm);
12758 +}
12759 +#endif
12760 +
12761  static inline void __mmput(struct mm_struct *mm)
12762  {
12763         VM_BUG_ON(atomic_read(&mm->mm_users));
12764 @@ -1274,6 +1300,9 @@ static void rt_mutex_init_task(struct task_struct *p)
12765   */
12766  static void posix_cpu_timers_init(struct task_struct *tsk)
12767  {
12768 +#ifdef CONFIG_PREEMPT_RT_BASE
12769 +       tsk->posix_timer_list = NULL;
12770 +#endif
12771         tsk->cputime_expires.prof_exp = 0;
12772         tsk->cputime_expires.virt_exp = 0;
12773         tsk->cputime_expires.sched_exp = 0;
12774 @@ -1399,6 +1428,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
12775         spin_lock_init(&p->alloc_lock);
12776  
12777         init_sigpending(&p->pending);
12778 +       p->sigqueue_cache = NULL;
12779  
12780         p->utime = p->stime = p->gtime = 0;
12781         p->utimescaled = p->stimescaled = 0;
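
The __put_task_struct_cb() and __mmdrop_delayed() helpers above both use the same deferred-free idiom: the callback receives a pointer to an embedded rcu_head and recovers the enclosing object with container_of() before releasing it. Below is a self-contained sketch of that idiom, with a plain function call standing in for call_rcu() (the rcu_head_sketch type and deferred_free() are hypothetical):

/* container_of()-based deferred free, as in __mmdrop_delayed() above. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head_sketch { void (*func)(struct rcu_head_sketch *); };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct mm_sketch {
        int users;
        struct rcu_head_sketch delayed_drop;
};

static void mmdrop_delayed_sketch(struct rcu_head_sketch *rhp)
{
        /* Recover the enclosing object from the embedded member. */
        struct mm_sketch *mm = container_of(rhp, struct mm_sketch, delayed_drop);

        printf("dropping mm with users=%d\n", mm->users);
        free(mm);
}

/* Stand-in for call_rcu(): here the callback simply runs immediately. */
static void deferred_free(struct rcu_head_sketch *rhp,
                          void (*func)(struct rcu_head_sketch *))
{
        rhp->func = func;
        rhp->func(rhp);
}

int main(void)
{
        struct mm_sketch *mm = malloc(sizeof(*mm));

        mm->users = 0;
        deferred_free(&mm->delayed_drop, mmdrop_delayed_sketch);
        return 0;
}
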
12782 diff --git a/kernel/futex.c b/kernel/futex.c
12783 index 46cb3a301bc1..6de82b959729 100644
12784 --- a/kernel/futex.c
12785 +++ b/kernel/futex.c
12786 @@ -895,7 +895,9 @@ void exit_pi_state_list(struct task_struct *curr)
12787                  * task still owns the PI-state:
12788                  */
12789                 if (head->next != next) {
12790 +                       raw_spin_unlock_irq(&curr->pi_lock);
12791                         spin_unlock(&hb->lock);
12792 +                       raw_spin_lock_irq(&curr->pi_lock);
12793                         continue;
12794                 }
12795  
12796 @@ -1290,6 +1292,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12797         struct futex_pi_state *pi_state = this->pi_state;
12798         u32 uninitialized_var(curval), newval;
12799         WAKE_Q(wake_q);
12800 +       WAKE_Q(wake_sleeper_q);
12801         bool deboost;
12802         int ret = 0;
12803  
12804 @@ -1356,7 +1359,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12805  
12806         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
12807  
12808 -       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
12809 +       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
12810 +                                       &wake_sleeper_q);
12811  
12812         /*
12813          * First unlock HB so the waiter does not spin on it once he got woken
12814 @@ -1364,8 +1368,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
12815          * deboost first (and lose our higher priority), then the task might get
12816          * scheduled away before the wake up can take place.
12817          */
12818 -       spin_unlock(&hb->lock);
12819 +       deboost |= spin_unlock_no_deboost(&hb->lock);
12820         wake_up_q(&wake_q);
12821 +       wake_up_q_sleeper(&wake_sleeper_q);
12822         if (deboost)
12823                 rt_mutex_adjust_prio(current);
12824  
12825 @@ -1915,6 +1920,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
12826                                 requeue_pi_wake_futex(this, &key2, hb2);
12827                                 drop_count++;
12828                                 continue;
12829 +                       } else if (ret == -EAGAIN) {
12830 +                               /*
12831 +                                * Waiter was woken by timeout or
12832 +                                * signal and has set pi_blocked_on to
12833 +                                * PI_WAKEUP_INPROGRESS before we
12834 +                                * tried to enqueue it on the rtmutex.
12835 +                                */
12836 +                               this->pi_state = NULL;
12837 +                               put_pi_state(pi_state);
12838 +                               continue;
12839                         } else if (ret) {
12840                                 /*
12841                                  * rt_mutex_start_proxy_lock() detected a
12842 @@ -2805,7 +2820,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12843         struct hrtimer_sleeper timeout, *to = NULL;
12844         struct rt_mutex_waiter rt_waiter;
12845         struct rt_mutex *pi_mutex = NULL;
12846 -       struct futex_hash_bucket *hb;
12847 +       struct futex_hash_bucket *hb, *hb2;
12848         union futex_key key2 = FUTEX_KEY_INIT;
12849         struct futex_q q = futex_q_init;
12850         int res, ret;
12851 @@ -2830,10 +2845,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12852          * The waiter is allocated on our stack, manipulated by the requeue
12853          * code while we sleep on uaddr.
12854          */
12855 -       debug_rt_mutex_init_waiter(&rt_waiter);
12856 -       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
12857 -       RB_CLEAR_NODE(&rt_waiter.tree_entry);
12858 -       rt_waiter.task = NULL;
12859 +       rt_mutex_init_waiter(&rt_waiter, false);
12860  
12861         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
12862         if (unlikely(ret != 0))
12863 @@ -2864,20 +2876,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12864         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
12865         futex_wait_queue_me(hb, &q, to);
12866  
12867 -       spin_lock(&hb->lock);
12868 -       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12869 -       spin_unlock(&hb->lock);
12870 -       if (ret)
12871 -               goto out_put_keys;
12872 +       /*
12873 +        * On RT we must avoid races with requeue and trying to block
12874 +        * on two mutexes (hb->lock and uaddr2's rtmutex) by
12875 +        * serializing access to pi_blocked_on with pi_lock.
12876 +        */
12877 +       raw_spin_lock_irq(&current->pi_lock);
12878 +       if (current->pi_blocked_on) {
12879 +               /*
12880 +                * We have been requeued or are in the process of
12881 +                * being requeued.
12882 +                */
12883 +               raw_spin_unlock_irq(&current->pi_lock);
12884 +       } else {
12885 +               /*
12886 +                * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS
12887 +                * prevents a concurrent requeue from moving us to the
12888 +                * uaddr2 rtmutex. After that we can safely acquire
12889 +                * (and possibly block on) hb->lock.
12890 +                */
12891 +               current->pi_blocked_on = PI_WAKEUP_INPROGRESS;
12892 +               raw_spin_unlock_irq(&current->pi_lock);
12893 +
12894 +               spin_lock(&hb->lock);
12895 +
12896 +               /*
12897 +                * Clean up pi_blocked_on. We might otherwise leak it
12898 +                * if we succeeded with the hb->lock in the fast
12899 +                * path.
12900 +                */
12901 +               raw_spin_lock_irq(&current->pi_lock);
12902 +               current->pi_blocked_on = NULL;
12903 +               raw_spin_unlock_irq(&current->pi_lock);
12904 +
12905 +               ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
12906 +               spin_unlock(&hb->lock);
12907 +               if (ret)
12908 +                       goto out_put_keys;
12909 +       }
12910  
12911         /*
12912 -        * In order for us to be here, we know our q.key == key2, and since
12913 -        * we took the hb->lock above, we also know that futex_requeue() has
12914 -        * completed and we no longer have to concern ourselves with a wakeup
12915 -        * race with the atomic proxy lock acquisition by the requeue code. The
12916 -        * futex_requeue dropped our key1 reference and incremented our key2
12917 -        * reference count.
12918 +        * In order to be here, we have either been requeued, are in
12919 +        * the process of being requeued, or requeue successfully
12920 +        * acquired uaddr2 on our behalf.  If pi_blocked_on was
12921 +        * non-null above, we may be racing with a requeue.  Do not
12922 +        * rely on q->lock_ptr to be hb2->lock until after blocking on
12923 +        * hb->lock or hb2->lock. The futex_requeue dropped our key1
12924 +        * reference and incremented our key2 reference count.
12925          */
12926 +       hb2 = hash_futex(&key2);
12927  
12928         /* Check if the requeue code acquired the second futex for us. */
12929         if (!q.rt_waiter) {
12930 @@ -2886,14 +2933,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12931                  * did a lock-steal - fix up the PI-state in that case.
12932                  */
12933                 if (q.pi_state && (q.pi_state->owner != current)) {
12934 -                       spin_lock(q.lock_ptr);
12935 +                       spin_lock(&hb2->lock);
12936 +                       BUG_ON(&hb2->lock != q.lock_ptr);
12937                         ret = fixup_pi_state_owner(uaddr2, &q, current);
12938                         /*
12939                          * Drop the reference to the pi state which
12940                          * the requeue_pi() code acquired for us.
12941                          */
12942                         put_pi_state(q.pi_state);
12943 -                       spin_unlock(q.lock_ptr);
12944 +                       spin_unlock(&hb2->lock);
12945                 }
12946         } else {
12947                 /*
12948 @@ -2906,7 +2954,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
12949                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
12950                 debug_rt_mutex_free_waiter(&rt_waiter);
12951  
12952 -               spin_lock(q.lock_ptr);
12953 +               spin_lock(&hb2->lock);
12954 +               BUG_ON(&hb2->lock != q.lock_ptr);
12955                 /*
12956                  * Fixup the pi_state owner and possibly acquire the lock if we
12957                  * haven't already.
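
The requeue-PI changes above hinge on a small handshake: a woken waiter marks itself PI_WAKEUP_INPROGRESS under its pi_lock before it blocks on hb->lock, and a concurrent futex_requeue() that sees the marker backs off with -EAGAIN instead of moving the waiter onto the second rtmutex. A hedged sketch of just that handshake (the task_sketch type and helper names are illustrative, not the kernel's):

/* Sketch of the PI_WAKEUP_INPROGRESS back-off described in the
 * comments above. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define PI_WAKEUP_INPROGRESS ((void *)1)

struct task_sketch {
        pthread_mutex_t pi_lock;
        void           *pi_blocked_on;   /* NULL, the marker, or a waiter */
};

static int try_requeue(struct task_sketch *t)
{
        int ret = 0;

        pthread_mutex_lock(&t->pi_lock);
        if (t->pi_blocked_on == PI_WAKEUP_INPROGRESS)
                ret = -EAGAIN;                /* waiter is waking up, skip it */
        else
                t->pi_blocked_on = (void *)2; /* "enqueued on uaddr2's rtmutex" */
        pthread_mutex_unlock(&t->pi_lock);
        return ret;
}

static void waiter_wakes(struct task_sketch *t)
{
        pthread_mutex_lock(&t->pi_lock);
        if (!t->pi_blocked_on)
                t->pi_blocked_on = PI_WAKEUP_INPROGRESS;
        pthread_mutex_unlock(&t->pi_lock);
}

int main(void)
{
        struct task_sketch t = { PTHREAD_MUTEX_INITIALIZER, NULL };

        waiter_wakes(&t);
        printf("requeue result: %d\n", try_requeue(&t)); /* -EAGAIN */
        return 0;
}
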
12958 diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
12959 index d3f24905852c..f87aa8fdcc51 100644
12960 --- a/kernel/irq/handle.c
12961 +++ b/kernel/irq/handle.c
12962 @@ -181,10 +181,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
12963  {
12964         irqreturn_t retval;
12965         unsigned int flags = 0;
12966 +       struct pt_regs *regs = get_irq_regs();
12967 +       u64 ip = regs ? instruction_pointer(regs) : 0;
12968  
12969         retval = __handle_irq_event_percpu(desc, &flags);
12970  
12971 -       add_interrupt_randomness(desc->irq_data.irq, flags);
12972 +#ifdef CONFIG_PREEMPT_RT_FULL
12973 +       desc->random_ip = ip;
12974 +#else
12975 +       add_interrupt_randomness(desc->irq_data.irq, flags, ip);
12976 +#endif
12977  
12978         if (!noirqdebug)
12979                 note_interrupt(desc, retval);
12980 diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
12981 index 9530fcd27704..fadf8f848299 100644
12982 --- a/kernel/irq/manage.c
12983 +++ b/kernel/irq/manage.c
12984 @@ -22,6 +22,7 @@
12985  #include "internals.h"
12986  
12987  #ifdef CONFIG_IRQ_FORCED_THREADING
12988 +# ifndef CONFIG_PREEMPT_RT_BASE
12989  __read_mostly bool force_irqthreads;
12990  
12991  static int __init setup_forced_irqthreads(char *arg)
12992 @@ -30,6 +31,7 @@ static int __init setup_forced_irqthreads(char *arg)
12993         return 0;
12994  }
12995  early_param("threadirqs", setup_forced_irqthreads);
12996 +# endif
12997  #endif
12998  
12999  static void __synchronize_hardirq(struct irq_desc *desc)
13000 @@ -233,7 +235,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
13001  
13002         if (desc->affinity_notify) {
13003                 kref_get(&desc->affinity_notify->kref);
13004 +
13005 +#ifdef CONFIG_PREEMPT_RT_BASE
13006 +               swork_queue(&desc->affinity_notify->swork);
13007 +#else
13008                 schedule_work(&desc->affinity_notify->work);
13009 +#endif
13010         }
13011         irqd_set(data, IRQD_AFFINITY_SET);
13012  
13013 @@ -271,10 +278,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
13014  }
13015  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
13016  
13017 -static void irq_affinity_notify(struct work_struct *work)
13018 +static void _irq_affinity_notify(struct irq_affinity_notify *notify)
13019  {
13020 -       struct irq_affinity_notify *notify =
13021 -               container_of(work, struct irq_affinity_notify, work);
13022         struct irq_desc *desc = irq_to_desc(notify->irq);
13023         cpumask_var_t cpumask;
13024         unsigned long flags;
13025 @@ -296,6 +301,35 @@ static void irq_affinity_notify(struct work_struct *work)
13026         kref_put(&notify->kref, notify->release);
13027  }
13028  
13029 +#ifdef CONFIG_PREEMPT_RT_BASE
13030 +static void init_helper_thread(void)
13031 +{
13032 +       static int init_sworker_once;
13033 +
13034 +       if (init_sworker_once)
13035 +               return;
13036 +       if (WARN_ON(swork_get()))
13037 +               return;
13038 +       init_sworker_once = 1;
13039 +}
13040 +
13041 +static void irq_affinity_notify(struct swork_event *swork)
13042 +{
13043 +       struct irq_affinity_notify *notify =
13044 +               container_of(swork, struct irq_affinity_notify, swork);
13045 +       _irq_affinity_notify(notify);
13046 +}
13047 +
13048 +#else
13049 +
13050 +static void irq_affinity_notify(struct work_struct *work)
13051 +{
13052 +       struct irq_affinity_notify *notify =
13053 +               container_of(work, struct irq_affinity_notify, work);
13054 +       _irq_affinity_notify(notify);
13055 +}
13056 +#endif
13057 +
13058  /**
13059   *     irq_set_affinity_notifier - control notification of IRQ affinity changes
13060   *     @irq:           Interrupt for which to enable/disable notification
13061 @@ -324,7 +358,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
13062         if (notify) {
13063                 notify->irq = irq;
13064                 kref_init(&notify->kref);
13065 +#ifdef CONFIG_PREEMPT_RT_BASE
13066 +               INIT_SWORK(&notify->swork, irq_affinity_notify);
13067 +               init_helper_thread();
13068 +#else
13069                 INIT_WORK(&notify->work, irq_affinity_notify);
13070 +#endif
13071         }
13072  
13073         raw_spin_lock_irqsave(&desc->lock, flags);
13074 @@ -879,7 +918,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
13075         local_bh_disable();
13076         ret = action->thread_fn(action->irq, action->dev_id);
13077         irq_finalize_oneshot(desc, action);
13078 -       local_bh_enable();
13079 +       /*
13080 +        * Interrupts which have real-time requirements can be set up
13081 +        * to avoid softirq processing in the thread handler. This is
13082 +        * safe as these interrupts do not raise soft interrupts.
13083 +        */
13084 +       if (irq_settings_no_softirq_call(desc))
13085 +               _local_bh_enable();
13086 +       else
13087 +               local_bh_enable();
13088         return ret;
13089  }
13090  
13091 @@ -976,6 +1023,12 @@ static int irq_thread(void *data)
13092                 if (action_ret == IRQ_WAKE_THREAD)
13093                         irq_wake_secondary(desc, action);
13094  
13095 +#ifdef CONFIG_PREEMPT_RT_FULL
13096 +               migrate_disable();
13097 +               add_interrupt_randomness(action->irq, 0,
13098 +                                desc->random_ip ^ (unsigned long) action);
13099 +               migrate_enable();
13100 +#endif
13101                 wake_threads_waitq(desc);
13102         }
13103  
13104 @@ -1336,6 +1389,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
13105                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
13106                 }
13107  
13108 +               if (new->flags & IRQF_NO_SOFTIRQ_CALL)
13109 +                       irq_settings_set_no_softirq_call(desc);
13110 +
13111                 /* Set default affinity mask once everything is setup */
13112                 setup_affinity(desc, mask);
13113  
13114 @@ -2061,7 +2117,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
13115   *     This call sets the internal irqchip state of an interrupt,
13116   *     depending on the value of @which.
13117   *
13118 - *     This function should be called with preemption disabled if the
13119 + *     This function should be called with migration disabled if the
13120   *     interrupt controller has per-cpu registers.
13121   */
13122  int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
13123 diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
13124 index 320579d89091..2df2d4445b1e 100644
13125 --- a/kernel/irq/settings.h
13126 +++ b/kernel/irq/settings.h
13127 @@ -16,6 +16,7 @@ enum {
13128         _IRQ_PER_CPU_DEVID      = IRQ_PER_CPU_DEVID,
13129         _IRQ_IS_POLLED          = IRQ_IS_POLLED,
13130         _IRQ_DISABLE_UNLAZY     = IRQ_DISABLE_UNLAZY,
13131 +       _IRQ_NO_SOFTIRQ_CALL    = IRQ_NO_SOFTIRQ_CALL,
13132         _IRQF_MODIFY_MASK       = IRQF_MODIFY_MASK,
13133  };
13134  
13135 @@ -30,6 +31,7 @@ enum {
13136  #define IRQ_PER_CPU_DEVID      GOT_YOU_MORON
13137  #define IRQ_IS_POLLED          GOT_YOU_MORON
13138  #define IRQ_DISABLE_UNLAZY     GOT_YOU_MORON
13139 +#define IRQ_NO_SOFTIRQ_CALL    GOT_YOU_MORON
13140  #undef IRQF_MODIFY_MASK
13141  #define IRQF_MODIFY_MASK       GOT_YOU_MORON
13142  
13143 @@ -40,6 +42,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
13144         desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
13145  }
13146  
13147 +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc)
13148 +{
13149 +       return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL;
13150 +}
13151 +
13152 +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc)
13153 +{
13154 +       desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL;
13155 +}
13156 +
13157  static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
13158  {
13159         return desc->status_use_accessors & _IRQ_PER_CPU;
13160 diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
13161 index 5707f97a3e6a..73f38dc7a7fb 100644
13162 --- a/kernel/irq/spurious.c
13163 +++ b/kernel/irq/spurious.c
13164 @@ -442,6 +442,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
13165  
13166  static int __init irqfixup_setup(char *str)
13167  {
13168 +#ifdef CONFIG_PREEMPT_RT_BASE
13169 +       pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13170 +       return 1;
13171 +#endif
13172         irqfixup = 1;
13173         printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
13174         printk(KERN_WARNING "This may impact system performance.\n");
13175 @@ -454,6 +458,10 @@ module_param(irqfixup, int, 0644);
13176  
13177  static int __init irqpoll_setup(char *str)
13178  {
13179 +#ifdef CONFIG_PREEMPT_RT_BASE
13180 +       pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n");
13181 +       return 1;
13182 +#endif
13183         irqfixup = 2;
13184         printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
13185                                 "enabled\n");
13186 diff --git a/kernel/irq_work.c b/kernel/irq_work.c
13187 index bcf107ce0854..2899ba0d23d1 100644
13188 --- a/kernel/irq_work.c
13189 +++ b/kernel/irq_work.c
13190 @@ -17,6 +17,7 @@
13191  #include <linux/cpu.h>
13192  #include <linux/notifier.h>
13193  #include <linux/smp.h>
13194 +#include <linux/interrupt.h>
13195  #include <asm/processor.h>
13196  
13197  
13198 @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void)
13199   */
13200  bool irq_work_queue_on(struct irq_work *work, int cpu)
13201  {
13202 +       struct llist_head *list;
13203 +
13204         /* All work should have been flushed before going offline */
13205         WARN_ON_ONCE(cpu_is_offline(cpu));
13206  
13207 @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
13208         if (!irq_work_claim(work))
13209                 return false;
13210  
13211 -       if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
13212 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ))
13213 +               list = &per_cpu(lazy_list, cpu);
13214 +       else
13215 +               list = &per_cpu(raised_list, cpu);
13216 +
13217 +       if (llist_add(&work->llnode, list))
13218                 arch_send_call_function_single_ipi(cpu);
13219  
13220         return true;
13221 @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on);
13222  /* Enqueue the irq work @work on the current CPU */
13223  bool irq_work_queue(struct irq_work *work)
13224  {
13225 +       struct llist_head *list;
13226 +       bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL);
13227 +
13228         /* Only queue if not already pending */
13229         if (!irq_work_claim(work))
13230                 return false;
13231 @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work)
13232         /* Queue the entry and raise the IPI if needed. */
13233         preempt_disable();
13234  
13235 -       /* If the work is "lazy", handle it from next tick if any */
13236 -       if (work->flags & IRQ_WORK_LAZY) {
13237 -               if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
13238 -                   tick_nohz_tick_stopped())
13239 -                       arch_irq_work_raise();
13240 -       } else {
13241 -               if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
13242 +       lazy_work = work->flags & IRQ_WORK_LAZY;
13243 +
13244 +       if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ)))
13245 +               list = this_cpu_ptr(&lazy_list);
13246 +       else
13247 +               list = this_cpu_ptr(&raised_list);
13248 +
13249 +       if (llist_add(&work->llnode, list)) {
13250 +               if (!lazy_work || tick_nohz_tick_stopped())
13251                         arch_irq_work_raise();
13252         }
13253  
13254 @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void)
13255         raised = this_cpu_ptr(&raised_list);
13256         lazy = this_cpu_ptr(&lazy_list);
13257  
13258 -       if (llist_empty(raised) || arch_irq_work_has_interrupt())
13259 -               if (llist_empty(lazy))
13260 -                       return false;
13261 +       if (llist_empty(raised) && llist_empty(lazy))
13262 +               return false;
13263  
13264         /* All work should have been flushed before going offline */
13265         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
13266 @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list)
13267         struct irq_work *work;
13268         struct llist_node *llnode;
13269  
13270 -       BUG_ON(!irqs_disabled());
13271 +       BUG_ON_NONRT(!irqs_disabled());
13272  
13273         if (llist_empty(list))
13274                 return;
13275 @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list)
13276  void irq_work_run(void)
13277  {
13278         irq_work_run_list(this_cpu_ptr(&raised_list));
13279 -       irq_work_run_list(this_cpu_ptr(&lazy_list));
13280 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) {
13281 +               /*
13282 +                * NOTE: we raise softirq via IPI for safety,
13283 +                * and execute in irq_work_tick() to move the
13284 +                * overhead from hard to soft irq context.
13285 +                */
13286 +               if (!llist_empty(this_cpu_ptr(&lazy_list)))
13287 +                       raise_softirq(TIMER_SOFTIRQ);
13288 +       } else
13289 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13290  }
13291  EXPORT_SYMBOL_GPL(irq_work_run);
13292  
13293 @@ -179,8 +200,17 @@ void irq_work_tick(void)
13294  
13295         if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
13296                 irq_work_run_list(raised);
13297 +
13298 +       if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
13299 +               irq_work_run_list(this_cpu_ptr(&lazy_list));
13300 +}
13301 +
13302 +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL)
13303 +void irq_work_tick_soft(void)
13304 +{
13305         irq_work_run_list(this_cpu_ptr(&lazy_list));
13306  }
13307 +#endif
13308  
13309  /*
13310   * Synchronize against the irq_work @entry, ensures the entry is not
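
The irq_work changes above route work onto either the raised (hard-irq) list or the lazy list, and on PREEMPT_RT_FULL everything not flagged IRQ_WORK_HARD_IRQ is deferred to the lazy list that irq_work_tick_soft() runs from the timer softirq. A small sketch of that selection rule (the flag values and the pick_list() helper are illustrative, not the kernel's):

/* Selection rule used by irq_work_queue() in the hunk above. */
#include <stdbool.h>
#include <stdio.h>

#define IRQ_WORK_LAZY     (1 << 0)   /* illustrative values */
#define IRQ_WORK_HARD_IRQ (1 << 1)

enum which_list { RAISED_LIST, LAZY_LIST };

static enum which_list pick_list(unsigned int flags, bool rt_full)
{
        bool lazy_work = flags & IRQ_WORK_LAZY;

        if (lazy_work || (rt_full && !(flags & IRQ_WORK_HARD_IRQ)))
                return LAZY_LIST;
        return RAISED_LIST;
}

int main(void)
{
        /* On PREEMPT_RT_FULL, unflagged work is deferred to the lazy list. */
        printf("%d\n", pick_list(0, true) == LAZY_LIST);                   /* 1 */
        /* IRQ_WORK_HARD_IRQ keeps it on the raised (hard-irq) list.       */
        printf("%d\n", pick_list(IRQ_WORK_HARD_IRQ, true) == RAISED_LIST); /* 1 */
        return 0;
}
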
13311 diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
13312 index ee1bc1bb8feb..ddef07958840 100644
13313 --- a/kernel/ksysfs.c
13314 +++ b/kernel/ksysfs.c
13315 @@ -136,6 +136,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
13316  
13317  #endif /* CONFIG_KEXEC_CORE */
13318  
13319 +#if defined(CONFIG_PREEMPT_RT_FULL)
13320 +static ssize_t  realtime_show(struct kobject *kobj,
13321 +                             struct kobj_attribute *attr, char *buf)
13322 +{
13323 +       return sprintf(buf, "%d\n", 1);
13324 +}
13325 +KERNEL_ATTR_RO(realtime);
13326 +#endif
13327 +
13328  /* whether file capabilities are enabled */
13329  static ssize_t fscaps_show(struct kobject *kobj,
13330                                   struct kobj_attribute *attr, char *buf)
13331 @@ -225,6 +234,9 @@ static struct attribute * kernel_attrs[] = {
13332         &rcu_expedited_attr.attr,
13333         &rcu_normal_attr.attr,
13334  #endif
13335 +#ifdef CONFIG_PREEMPT_RT_FULL
13336 +       &realtime_attr.attr,
13337 +#endif
13338         NULL
13339  };
13340  
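
With CONFIG_PREEMPT_RT_FULL the hunk above adds a read-only "realtime" attribute to the kernel kobject, so it should show up as /sys/kernel/realtime and read back "1". A small user-space check, assuming that path (non-RT kernels simply won't have the file):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        if (f) {
                if (fscanf(f, "%d", &rt) != 1)
                        rt = 0;
                fclose(f);
        }
        printf("PREEMPT_RT_FULL kernel: %s\n", rt ? "yes" : "no");
        return 0;
}
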
13341 diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
13342 index 31322a4275cd..c6bba9299d8b 100644
13343 --- a/kernel/locking/Makefile
13344 +++ b/kernel/locking/Makefile
13345 @@ -2,7 +2,7 @@
13346  # and is generally not a function of system call inputs.
13347  KCOV_INSTRUMENT                := n
13348  
13349 -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
13350 +obj-y += semaphore.o percpu-rwsem.o
13351  
13352  ifdef CONFIG_FUNCTION_TRACER
13353  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
13354 @@ -11,7 +11,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE)
13355  CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE)
13356  endif
13357  
13358 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13359 +obj-y += mutex.o
13360  obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
13361 +obj-y += rwsem.o
13362 +endif
13363  obj-$(CONFIG_LOCKDEP) += lockdep.o
13364  ifeq ($(CONFIG_PROC_FS),y)
13365  obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
13366 @@ -25,7 +29,10 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
13367  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
13368  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
13369  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
13370 +ifneq ($(CONFIG_PREEMPT_RT_FULL),y)
13371  obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
13372  obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
13373 +endif
13374 +obj-$(CONFIG_PREEMPT_RT_FULL) += rt.o
13375  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
13376  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
13377 diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
13378 index 951cfcd10b4a..57e0ea72c28a 100644
13379 --- a/kernel/locking/lglock.c
13380 +++ b/kernel/locking/lglock.c
13381 @@ -4,6 +4,15 @@
13382  #include <linux/cpu.h>
13383  #include <linux/string.h>
13384  
13385 +#ifndef CONFIG_PREEMPT_RT_FULL
13386 +# define lg_lock_ptr           arch_spinlock_t
13387 +# define lg_do_lock(l)         arch_spin_lock(l)
13388 +# define lg_do_unlock(l)       arch_spin_unlock(l)
13389 +#else
13390 +# define lg_lock_ptr           struct rt_mutex
13391 +# define lg_do_lock(l)         __rt_spin_lock__no_mg(l)
13392 +# define lg_do_unlock(l)       __rt_spin_unlock(l)
13393 +#endif
13394  /*
13395   * Note there is no uninit, so lglocks cannot be defined in
13396   * modules (but it's fine to use them from there)
13397 @@ -12,51 +21,60 @@
13398  
13399  void lg_lock_init(struct lglock *lg, char *name)
13400  {
13401 +#ifdef CONFIG_PREEMPT_RT_FULL
13402 +       int i;
13403 +
13404 +       for_each_possible_cpu(i) {
13405 +               struct rt_mutex *lock = per_cpu_ptr(lg->lock, i);
13406 +
13407 +               rt_mutex_init(lock);
13408 +       }
13409 +#endif
13410         LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
13411  }
13412  EXPORT_SYMBOL(lg_lock_init);
13413  
13414  void lg_local_lock(struct lglock *lg)
13415  {
13416 -       arch_spinlock_t *lock;
13417 +       lg_lock_ptr *lock;
13418  
13419 -       preempt_disable();
13420 +       migrate_disable();
13421         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
13422         lock = this_cpu_ptr(lg->lock);
13423 -       arch_spin_lock(lock);
13424 +       lg_do_lock(lock);
13425  }
13426  EXPORT_SYMBOL(lg_local_lock);
13427  
13428  void lg_local_unlock(struct lglock *lg)
13429  {
13430 -       arch_spinlock_t *lock;
13431 +       lg_lock_ptr *lock;
13432  
13433         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
13434         lock = this_cpu_ptr(lg->lock);
13435 -       arch_spin_unlock(lock);
13436 -       preempt_enable();
13437 +       lg_do_unlock(lock);
13438 +       migrate_enable();
13439  }
13440  EXPORT_SYMBOL(lg_local_unlock);
13441  
13442  void lg_local_lock_cpu(struct lglock *lg, int cpu)
13443  {
13444 -       arch_spinlock_t *lock;
13445 +       lg_lock_ptr *lock;
13446  
13447 -       preempt_disable();
13448 +       preempt_disable_nort();
13449         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
13450         lock = per_cpu_ptr(lg->lock, cpu);
13451 -       arch_spin_lock(lock);
13452 +       lg_do_lock(lock);
13453  }
13454  EXPORT_SYMBOL(lg_local_lock_cpu);
13455  
13456  void lg_local_unlock_cpu(struct lglock *lg, int cpu)
13457  {
13458 -       arch_spinlock_t *lock;
13459 +       lg_lock_ptr *lock;
13460  
13461         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
13462         lock = per_cpu_ptr(lg->lock, cpu);
13463 -       arch_spin_unlock(lock);
13464 -       preempt_enable();
13465 +       lg_do_unlock(lock);
13466 +       preempt_enable_nort();
13467  }
13468  EXPORT_SYMBOL(lg_local_unlock_cpu);
13469  
13470 @@ -68,30 +86,30 @@ void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
13471         if (cpu2 < cpu1)
13472                 swap(cpu1, cpu2);
13473  
13474 -       preempt_disable();
13475 +       preempt_disable_nort();
13476         lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
13477 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
13478 -       arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
13479 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu1));
13480 +       lg_do_lock(per_cpu_ptr(lg->lock, cpu2));
13481  }
13482  
13483  void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
13484  {
13485         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
13486 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
13487 -       arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
13488 -       preempt_enable();
13489 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu1));
13490 +       lg_do_unlock(per_cpu_ptr(lg->lock, cpu2));
13491 +       preempt_enable_nort();
13492  }
13493  
13494  void lg_global_lock(struct lglock *lg)
13495  {
13496         int i;
13497  
13498 -       preempt_disable();
13499 +       preempt_disable_nort();
13500         lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
13501         for_each_possible_cpu(i) {
13502 -               arch_spinlock_t *lock;
13503 +               lg_lock_ptr *lock;
13504                 lock = per_cpu_ptr(lg->lock, i);
13505 -               arch_spin_lock(lock);
13506 +               lg_do_lock(lock);
13507         }
13508  }
13509  EXPORT_SYMBOL(lg_global_lock);
13510 @@ -102,10 +120,35 @@ void lg_global_unlock(struct lglock *lg)
13511  
13512         lock_release(&lg->lock_dep_map, 1, _RET_IP_);
13513         for_each_possible_cpu(i) {
13514 -               arch_spinlock_t *lock;
13515 +               lg_lock_ptr *lock;
13516                 lock = per_cpu_ptr(lg->lock, i);
13517 -               arch_spin_unlock(lock);
13518 +               lg_do_unlock(lock);
13519         }
13520 -       preempt_enable();
13521 +       preempt_enable_nort();
13522  }
13523  EXPORT_SYMBOL(lg_global_unlock);
13524 +
13525 +#ifdef CONFIG_PREEMPT_RT_FULL
13526 +/*
13527 + * HACK: If you use this, you get to keep the pieces.
13528 + * Used in queue_stop_cpus_work() when stop machinery
13529 + * is called from an inactive CPU, so we can't schedule.
13530 + */
13531 +# define lg_do_trylock_relax(l)                        \
13532 +       do {                                    \
13533 +               while (!__rt_spin_trylock(l))   \
13534 +                       cpu_relax();            \
13535 +       } while (0)
13536 +
13537 +void lg_global_trylock_relax(struct lglock *lg)
13538 +{
13539 +       int i;
13540 +
13541 +       lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
13542 +       for_each_possible_cpu(i) {
13543 +               lg_lock_ptr *lock;
13544 +               lock = per_cpu_ptr(lg->lock, i);
13545 +               lg_do_trylock_relax(lock);
13546 +       }
13547 +}
13548 +#endif
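
lg_global_trylock_relax() above spins on a trylock with cpu_relax() between attempts because it may be called from a context that cannot schedule. Here is a userspace sketch of the same pattern, with sched_yield() standing in for cpu_relax() (names are illustrative):

/* Busy-wait trylock loop in the spirit of lg_do_trylock_relax(). */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void trylock_relax(pthread_mutex_t *l)
{
        while (pthread_mutex_trylock(l) != 0)
                sched_yield();      /* cpu_relax() in the kernel version */
}

int main(void)
{
        trylock_relax(&lock);       /* acquires immediately, nothing contends */
        pthread_mutex_unlock(&lock);
        puts("acquired and released");
        return 0;
}
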
13549 diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
13550 index 589d763a49b3..4b48c4bfb60c 100644
13551 --- a/kernel/locking/lockdep.c
13552 +++ b/kernel/locking/lockdep.c
13553 @@ -3686,6 +3686,7 @@ static void check_flags(unsigned long flags)
13554                 }
13555         }
13556  
13557 +#ifndef CONFIG_PREEMPT_RT_FULL
13558         /*
13559          * We dont accurately track softirq state in e.g.
13560          * hardirq contexts (such as on 4KSTACKS), so only
13561 @@ -3700,6 +3701,7 @@ static void check_flags(unsigned long flags)
13562                         DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
13563                 }
13564         }
13565 +#endif
13566  
13567         if (!debug_locks)
13568                 print_irqtrace_events(current);
13569 diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
13570 index f8c5af52a131..788068773e61 100644
13571 --- a/kernel/locking/locktorture.c
13572 +++ b/kernel/locking/locktorture.c
13573 @@ -26,7 +26,6 @@
13574  #include <linux/kthread.h>
13575  #include <linux/sched/rt.h>
13576  #include <linux/spinlock.h>
13577 -#include <linux/rwlock.h>
13578  #include <linux/mutex.h>
13579  #include <linux/rwsem.h>
13580  #include <linux/smp.h>
13581 diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
13582 new file mode 100644
13583 index 000000000000..665754c00e1e
13584 --- /dev/null
13585 +++ b/kernel/locking/rt.c
13586 @@ -0,0 +1,498 @@
13587 +/*
13588 + * kernel/rt.c
13589 + *
13590 + * Real-Time Preemption Support
13591 + *
13592 + * started by Ingo Molnar:
13593 + *
13594 + *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
13595 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13596 + *
13597 + * historic credit for proving that Linux spinlocks can be implemented via
13598 + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
13599 + * and others) who prototyped it on 2.4 and did lots of comparative
13600 + * research and analysis; TimeSys, for proving that you can implement a
13601 + * fully preemptible kernel via the use of IRQ threading and mutexes;
13602 + * Bill Huey for persuasively arguing on lkml that the mutex model is the
13603 + * right one; and to MontaVista, who ported pmutexes to 2.6.
13604 + *
13605 + * This code is a from-scratch implementation and is not based on pmutexes,
13606 + * but the idea of converting spinlocks to mutexes is used here too.
13607 + *
13608 + * lock debugging, locking tree, deadlock detection:
13609 + *
13610 + *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
13611 + *  Released under the General Public License (GPL).
13612 + *
13613 + * Includes portions of the generic R/W semaphore implementation from:
13614 + *
13615 + *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
13616 + *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
13617 + *  - Derived also from comments by Linus
13618 + *
13619 + * Pending ownership of locks and ownership stealing:
13620 + *
13621 + *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
13622 + *
13623 + *   (also by Steven Rostedt)
13624 + *    - Converted single pi_lock to individual task locks.
13625 + *
13626 + * By Esben Nielsen:
13627 + *    Doing priority inheritance with help of the scheduler.
13628 + *
13629 + *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
13630 + *  - major rework based on Esben Nielsens initial patch
13631 + *  - replaced thread_info references by task_struct refs
13632 + *  - removed task->pending_owner dependency
13633 + *  - BKL drop/reacquire for semaphore style locks to avoid deadlocks
13634 + *    in the scheduler return path as discussed with Steven Rostedt
13635 + *
13636 + *  Copyright (C) 2006, Kihon Technologies Inc.
13637 + *    Steven Rostedt <rostedt@goodmis.org>
13638 + *  - debugged and patched Thomas Gleixner's rework.
13639 + *  - added back the cmpxchg to the rework.
13640 + *  - turned atomic require back on for SMP.
13641 + */
13642 +
13643 +#include <linux/spinlock.h>
13644 +#include <linux/rtmutex.h>
13645 +#include <linux/sched.h>
13646 +#include <linux/delay.h>
13647 +#include <linux/module.h>
13648 +#include <linux/kallsyms.h>
13649 +#include <linux/syscalls.h>
13650 +#include <linux/interrupt.h>
13651 +#include <linux/plist.h>
13652 +#include <linux/fs.h>
13653 +#include <linux/futex.h>
13654 +#include <linux/hrtimer.h>
13655 +
13656 +#include "rtmutex_common.h"
13657 +
13658 +/*
13659 + * struct mutex functions
13660 + */
13661 +void __mutex_do_init(struct mutex *mutex, const char *name,
13662 +                    struct lock_class_key *key)
13663 +{
13664 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13665 +       /*
13666 +        * Make sure we are not reinitializing a held lock:
13667 +        */
13668 +       debug_check_no_locks_freed((void *)mutex, sizeof(*mutex));
13669 +       lockdep_init_map(&mutex->dep_map, name, key, 0);
13670 +#endif
13671 +       mutex->lock.save_state = 0;
13672 +}
13673 +EXPORT_SYMBOL(__mutex_do_init);
13674 +
13675 +void __lockfunc _mutex_lock(struct mutex *lock)
13676 +{
13677 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13678 +       rt_mutex_lock(&lock->lock);
13679 +}
13680 +EXPORT_SYMBOL(_mutex_lock);
13681 +
13682 +int __lockfunc _mutex_lock_interruptible(struct mutex *lock)
13683 +{
13684 +       int ret;
13685 +
13686 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13687 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13688 +       if (ret)
13689 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13690 +       return ret;
13691 +}
13692 +EXPORT_SYMBOL(_mutex_lock_interruptible);
13693 +
13694 +int __lockfunc _mutex_lock_killable(struct mutex *lock)
13695 +{
13696 +       int ret;
13697 +
13698 +       mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
13699 +       ret = rt_mutex_lock_killable(&lock->lock);
13700 +       if (ret)
13701 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13702 +       return ret;
13703 +}
13704 +EXPORT_SYMBOL(_mutex_lock_killable);
13705 +
13706 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13707 +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass)
13708 +{
13709 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13710 +       rt_mutex_lock(&lock->lock);
13711 +}
13712 +EXPORT_SYMBOL(_mutex_lock_nested);
13713 +
13714 +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
13715 +{
13716 +       mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_);
13717 +       rt_mutex_lock(&lock->lock);
13718 +}
13719 +EXPORT_SYMBOL(_mutex_lock_nest_lock);
13720 +
13721 +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass)
13722 +{
13723 +       int ret;
13724 +
13725 +       mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_);
13726 +       ret = rt_mutex_lock_interruptible(&lock->lock);
13727 +       if (ret)
13728 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13729 +       return ret;
13730 +}
13731 +EXPORT_SYMBOL(_mutex_lock_interruptible_nested);
13732 +
13733 +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass)
13734 +{
13735 +       int ret;
13736 +
13737 +       mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
13738 +       ret = rt_mutex_lock_killable(&lock->lock);
13739 +       if (ret)
13740 +               mutex_release(&lock->dep_map, 1, _RET_IP_);
13741 +       return ret;
13742 +}
13743 +EXPORT_SYMBOL(_mutex_lock_killable_nested);
13744 +#endif
13745 +
13746 +int __lockfunc _mutex_trylock(struct mutex *lock)
13747 +{
13748 +       int ret = rt_mutex_trylock(&lock->lock);
13749 +
13750 +       if (ret)
13751 +               mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
13752 +
13753 +       return ret;
13754 +}
13755 +EXPORT_SYMBOL(_mutex_trylock);
13756 +
13757 +void __lockfunc _mutex_unlock(struct mutex *lock)
13758 +{
13759 +       mutex_release(&lock->dep_map, 1, _RET_IP_);
13760 +       rt_mutex_unlock(&lock->lock);
13761 +}
13762 +EXPORT_SYMBOL(_mutex_unlock);
13763 +
13764 +/*
13765 + * rwlock_t functions
13766 + */
13767 +int __lockfunc rt_write_trylock(rwlock_t *rwlock)
13768 +{
13769 +       int ret;
13770 +
13771 +       migrate_disable();
13772 +       ret = rt_mutex_trylock(&rwlock->lock);
13773 +       if (ret)
13774 +               rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13775 +       else
13776 +               migrate_enable();
13777 +
13778 +       return ret;
13779 +}
13780 +EXPORT_SYMBOL(rt_write_trylock);
13781 +
13782 +int __lockfunc rt_write_trylock_irqsave(rwlock_t *rwlock, unsigned long *flags)
13783 +{
13784 +       int ret;
13785 +
13786 +       *flags = 0;
13787 +       ret = rt_write_trylock(rwlock);
13788 +       return ret;
13789 +}
13790 +EXPORT_SYMBOL(rt_write_trylock_irqsave);
13791 +
13792 +int __lockfunc rt_read_trylock(rwlock_t *rwlock)
13793 +{
13794 +       struct rt_mutex *lock = &rwlock->lock;
13795 +       int ret = 1;
13796 +
13797 +       /*
13798 +        * recursive read locks succeed when current owns the lock,
13799 +        * but not when read_depth == 0, which means that the lock is
13800 +        * write locked.
13801 +        */
13802 +       if (rt_mutex_owner(lock) != current) {
13803 +               migrate_disable();
13804 +               ret = rt_mutex_trylock(lock);
13805 +               if (ret)
13806 +                       rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_);
13807 +               else
13808 +                       migrate_enable();
13809 +
13810 +       } else if (!rwlock->read_depth) {
13811 +               ret = 0;
13812 +       }
13813 +
13814 +       if (ret)
13815 +               rwlock->read_depth++;
13816 +
13817 +       return ret;
13818 +}
13819 +EXPORT_SYMBOL(rt_read_trylock);
13820 +
13821 +void __lockfunc rt_write_lock(rwlock_t *rwlock)
13822 +{
13823 +       rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13824 +       __rt_spin_lock(&rwlock->lock);
13825 +}
13826 +EXPORT_SYMBOL(rt_write_lock);
13827 +
13828 +void __lockfunc rt_read_lock(rwlock_t *rwlock)
13829 +{
13830 +       struct rt_mutex *lock = &rwlock->lock;
13831 +
13832 +
13833 +       /*
13834 +        * recursive read locks succeed when current owns the lock
13835 +        */
13836 +       if (rt_mutex_owner(lock) != current) {
13837 +               rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
13838 +               __rt_spin_lock(lock);
13839 +       }
13840 +       rwlock->read_depth++;
13841 +}
13842 +
13843 +EXPORT_SYMBOL(rt_read_lock);
13844 +
13845 +void __lockfunc rt_write_unlock(rwlock_t *rwlock)
13846 +{
13847 +       /* NOTE: we always pass in '1' for nested, for simplicity */
13848 +       rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13849 +       __rt_spin_unlock(&rwlock->lock);
13850 +       migrate_enable();
13851 +}
13852 +EXPORT_SYMBOL(rt_write_unlock);
13853 +
13854 +void __lockfunc rt_read_unlock(rwlock_t *rwlock)
13855 +{
13856 +       /* Release the lock only when read_depth is down to 0 */
13857 +       if (--rwlock->read_depth == 0) {
13858 +               rwlock_release(&rwlock->dep_map, 1, _RET_IP_);
13859 +               __rt_spin_unlock(&rwlock->lock);
13860 +               migrate_enable();
13861 +       }
13862 +}
13863 +EXPORT_SYMBOL(rt_read_unlock);
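The comments above describe the reader-recursion rule for RT rwlocks: read_depth only grows when current already owns the underlying rt_mutex, and the lock is really dropped only when the depth returns to zero. As an illustration, and assuming the usual RT mapping of the rwlock_t API (read_lock() onto rt_read_lock(), read_unlock() onto rt_read_unlock()), a nested reader could look like the sketch below; the stats lock and counter are hypothetical and not part of this patch.

        static DEFINE_RWLOCK(stats_lock);
        static unsigned long stats_counter;

        static void stats_report(void)
        {
                read_lock(&stats_lock);         /* depth 0 -> 1: acquires the rt_mutex */

                read_lock(&stats_lock);         /* same owner: depth 1 -> 2, no blocking */
                pr_info("stats: %lu\n", stats_counter);
                read_unlock(&stats_lock);       /* depth 2 -> 1: lock stays held */

                read_unlock(&stats_lock);       /* depth 1 -> 0: releases the rt_mutex */
        }

A writer, by contrast, blocks in rt_write_lock() until the read depth has returned to zero and the rt_mutex is released.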
13864 +
13865 +unsigned long __lockfunc rt_write_lock_irqsave(rwlock_t *rwlock)
13866 +{
13867 +       rt_write_lock(rwlock);
13868 +
13869 +       return 0;
13870 +}
13871 +EXPORT_SYMBOL(rt_write_lock_irqsave);
13872 +
13873 +unsigned long __lockfunc rt_read_lock_irqsave(rwlock_t *rwlock)
13874 +{
13875 +       rt_read_lock(rwlock);
13876 +
13877 +       return 0;
13878 +}
13879 +EXPORT_SYMBOL(rt_read_lock_irqsave);
13880 +
13881 +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key)
13882 +{
13883 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
13884 +       /*
13885 +        * Make sure we are not reinitializing a held lock:
13886 +        */
13887 +       debug_check_no_locks_freed((void *)rwlock, sizeof(*rwlock));
13888 +       lockdep_init_map(&rwlock->dep_map, name, key, 0);
13889 +#endif
13890 +       rwlock->lock.save_state = 1;
13891 +       rwlock->read_depth = 0;
13892 +}
13893 +EXPORT_SYMBOL(__rt_rwlock_init);
13894 +
13895 +/*
13896 + * rw_semaphores
13897 + */
13898 +
13899 +void  rt_up_write(struct rw_semaphore *rwsem)
13900 +{
13901 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
13902 +       rt_mutex_unlock(&rwsem->lock);
13903 +}
13904 +EXPORT_SYMBOL(rt_up_write);
13905 +
13906 +void __rt_up_read(struct rw_semaphore *rwsem)
13907 +{
13908 +       if (--rwsem->read_depth == 0)
13909 +               rt_mutex_unlock(&rwsem->lock);
13910 +}
13911 +
13912 +void  rt_up_read(struct rw_semaphore *rwsem)
13913 +{
13914 +       rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
13915 +       __rt_up_read(rwsem);
13916 +}
13917 +EXPORT_SYMBOL(rt_up_read);
13918 +
13919 +/*
13920 + * downgrade a write lock into a read lock
13921 + * - just wake up any readers at the front of the queue
13922 + */
13923 +void  rt_downgrade_write(struct rw_semaphore *rwsem)
13924 +{
13925 +       BUG_ON(rt_mutex_owner(&rwsem->lock) != current);
13926 +       rwsem->read_depth = 1;
13927 +}
13928 +EXPORT_SYMBOL(rt_downgrade_write);
13929 +
13930 +int  rt_down_write_trylock(struct rw_semaphore *rwsem)
13931 +{
13932 +       int ret = rt_mutex_trylock(&rwsem->lock);
13933 +
13934 +       if (ret)
13935 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
13936 +       return ret;
13937 +}
13938 +EXPORT_SYMBOL(rt_down_write_trylock);
13939 +
13940 +void  rt_down_write(struct rw_semaphore *rwsem)
13941 +{
13942 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
13943 +       rt_mutex_lock(&rwsem->lock);
13944 +}
13945 +EXPORT_SYMBOL(rt_down_write);
13946 +
13947 +int rt_down_write_killable(struct rw_semaphore *rwsem)
13948 +{
13949 +       int ret;
13950 +
13951 +       rwsem_acquire(&rwsem->dep_map, 0, 0, _RET_IP_);
13952 +       ret = rt_mutex_lock_killable(&rwsem->lock);
13953 +       if (ret)
13954 +               rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
13955 +       return ret;
13956 +}
13957 +EXPORT_SYMBOL(rt_down_write_killable);
13958 +
13959 +int rt_down_write_killable_nested(struct rw_semaphore *rwsem, int subclass)
13960 +{
13961 +       int ret;
13962 +
13963 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
13964 +       ret = rt_mutex_lock_killable(&rwsem->lock);
13965 +       if (ret)
13966 +               rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
13967 +       return ret;
13968 +}
13969 +EXPORT_SYMBOL(rt_down_write_killable_nested);
13970 +
13971 +void  rt_down_write_nested(struct rw_semaphore *rwsem, int subclass)
13972 +{
13973 +       rwsem_acquire(&rwsem->dep_map, subclass, 0, _RET_IP_);
13974 +       rt_mutex_lock(&rwsem->lock);
13975 +}
13976 +EXPORT_SYMBOL(rt_down_write_nested);
13977 +
13978 +void rt_down_write_nested_lock(struct rw_semaphore *rwsem,
13979 +                              struct lockdep_map *nest)
13980 +{
13981 +       rwsem_acquire_nest(&rwsem->dep_map, 0, 0, nest, _RET_IP_);
13982 +       rt_mutex_lock(&rwsem->lock);
13983 +}
13984 +EXPORT_SYMBOL(rt_down_write_nested_lock);
13985 +
13986 +int rt__down_read_trylock(struct rw_semaphore *rwsem)
13987 +{
13988 +       struct rt_mutex *lock = &rwsem->lock;
13989 +       int ret = 1;
13990 +
13991 +       /*
13992 +        * recursive read locks succeed when current owns the rwsem,
13993 +        * but not when read_depth == 0 which means that the rwsem is
13994 +        * write locked.
13995 +        */
13996 +       if (rt_mutex_owner(lock) != current)
13997 +               ret = rt_mutex_trylock(&rwsem->lock);
13998 +       else if (!rwsem->read_depth)
13999 +               ret = 0;
14000 +
14001 +       if (ret)
14002 +               rwsem->read_depth++;
14003 +       return ret;
14004 +
14005 +}
14006 +
14007 +int  rt_down_read_trylock(struct rw_semaphore *rwsem)
14008 +{
14009 +       int ret;
14010 +
14011 +       ret = rt__down_read_trylock(rwsem);
14012 +       if (ret)
14013 +               rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
14014 +
14015 +       return ret;
14016 +}
14017 +EXPORT_SYMBOL(rt_down_read_trylock);
14018 +
14019 +void rt__down_read(struct rw_semaphore *rwsem)
14020 +{
14021 +       struct rt_mutex *lock = &rwsem->lock;
14022 +
14023 +       if (rt_mutex_owner(lock) != current)
14024 +               rt_mutex_lock(&rwsem->lock);
14025 +       rwsem->read_depth++;
14026 +}
14027 +EXPORT_SYMBOL(rt__down_read);
14028 +
14029 +static void __rt_down_read(struct rw_semaphore *rwsem, int subclass)
14030 +{
14031 +       rwsem_acquire_read(&rwsem->dep_map, subclass, 0, _RET_IP_);
14032 +       rt__down_read(rwsem);
14033 +}
14034 +
14035 +void  rt_down_read(struct rw_semaphore *rwsem)
14036 +{
14037 +       __rt_down_read(rwsem, 0);
14038 +}
14039 +EXPORT_SYMBOL(rt_down_read);
14040 +
14041 +void  rt_down_read_nested(struct rw_semaphore *rwsem, int subclass)
14042 +{
14043 +       __rt_down_read(rwsem, subclass);
14044 +}
14045 +EXPORT_SYMBOL(rt_down_read_nested);
14046 +
14047 +void  __rt_rwsem_init(struct rw_semaphore *rwsem, const char *name,
14048 +                             struct lock_class_key *key)
14049 +{
14050 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14051 +       /*
14052 +        * Make sure we are not reinitializing a held lock:
14053 +        */
14054 +       debug_check_no_locks_freed((void *)rwsem, sizeof(*rwsem));
14055 +       lockdep_init_map(&rwsem->dep_map, name, key, 0);
14056 +#endif
14057 +       rwsem->read_depth = 0;
14058 +       rwsem->lock.save_state = 0;
14059 +}
14060 +EXPORT_SYMBOL(__rt_rwsem_init);
14061 +
14062 +/**
14063 + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
14064 + * @cnt: the atomic which we are to dec
14065 + * @lock: the mutex to return holding if we dec to 0
14066 + *
14067 + * return true and hold lock if we dec to 0, return false otherwise
14068 + */
14069 +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
14070 +{
14071 +       /* dec if we can't possibly hit 0 */
14072 +       if (atomic_add_unless(cnt, -1, 1))
14073 +               return 0;
14074 +       /* we might hit 0, so take the lock */
14075 +       mutex_lock(lock);
14076 +       if (!atomic_dec_and_test(cnt)) {
14077 +               /* when we actually did the dec, we didn't hit 0 */
14078 +               mutex_unlock(lock);
14079 +               return 0;
14080 +       }
14081 +       /* we hit 0, and we hold the lock */
14082 +       return 1;
14083 +}
14084 +EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
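atomic_dec_and_mutex_lock() is typically used on a release path that must take a mutex only when the final reference is dropped, so that teardown and lookups stay serialized. A minimal sketch of such a caller follows; the cache object, list and lock are hypothetical and not from this patch.

        static DEFINE_MUTEX(cache_lock);        /* protects cache_list */
        static LIST_HEAD(cache_list);

        struct cache_obj {
                atomic_t                refcount;
                struct list_head        node;
        };

        static void cache_obj_put(struct cache_obj *obj)
        {
                /* Returns 1 only when the count reached zero, with cache_lock held. */
                if (!atomic_dec_and_mutex_lock(&obj->refcount, &cache_lock))
                        return;

                list_del(&obj->node);
                mutex_unlock(&cache_lock);
                kfree(obj);
        }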
14085 diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
14086 index 1ec0f48962b3..2576f7ccf8e2 100644
14087 --- a/kernel/locking/rtmutex.c
14088 +++ b/kernel/locking/rtmutex.c
14089 @@ -7,6 +7,11 @@
14090   *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
14091   *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
14092   *  Copyright (C) 2006 Esben Nielsen
14093 + *  Adaptive Spinlocks:
14094 + *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
14095 + *                                  and Peter Morreale,
14096 + * Adaptive Spinlocks simplification:
14097 + *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
14098   *
14099   *  See Documentation/locking/rt-mutex-design.txt for details.
14100   */
14101 @@ -16,6 +21,7 @@
14102  #include <linux/sched/rt.h>
14103  #include <linux/sched/deadline.h>
14104  #include <linux/timer.h>
14105 +#include <linux/ww_mutex.h>
14106  
14107  #include "rtmutex_common.h"
14108  
14109 @@ -69,6 +75,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
14110                 clear_rt_mutex_waiters(lock);
14111  }
14112  
14113 +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
14114 +{
14115 +       return waiter && waiter != PI_WAKEUP_INPROGRESS &&
14116 +               waiter != PI_REQUEUE_INPROGRESS;
14117 +}
14118 +
14119  /*
14120   * We can speed up the acquire/release, if there's no debugging state to be
14121   * set up.
14122 @@ -350,6 +362,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
14123         return debug_rt_mutex_detect_deadlock(waiter, chwalk);
14124  }
14125  
14126 +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter)
14127 +{
14128 +       if (waiter->savestate)
14129 +               wake_up_lock_sleeper(waiter->task);
14130 +       else
14131 +               wake_up_process(waiter->task);
14132 +}
14133 +
14134  /*
14135   * Max number of times we'll walk the boosting chain:
14136   */
14137 @@ -357,7 +377,8 @@ int max_lock_depth = 1024;
14138  
14139  static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
14140  {
14141 -       return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
14142 +       return rt_mutex_real_waiter(p->pi_blocked_on) ?
14143 +               p->pi_blocked_on->lock : NULL;
14144  }
14145  
14146  /*
14147 @@ -493,7 +514,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14148          * reached or the state of the chain has changed while we
14149          * dropped the locks.
14150          */
14151 -       if (!waiter)
14152 +       if (!rt_mutex_real_waiter(waiter))
14153                 goto out_unlock_pi;
14154  
14155         /*
14156 @@ -655,13 +676,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14157          * follow here. This is the end of the chain we are walking.
14158          */
14159         if (!rt_mutex_owner(lock)) {
14160 +               struct rt_mutex_waiter *lock_top_waiter;
14161 +
14162                 /*
14163                  * If the requeue [7] above changed the top waiter,
14164                  * then we need to wake the new top waiter up to try
14165                  * to get the lock.
14166                  */
14167 -               if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
14168 -                       wake_up_process(rt_mutex_top_waiter(lock)->task);
14169 +               lock_top_waiter = rt_mutex_top_waiter(lock);
14170 +               if (prerequeue_top_waiter != lock_top_waiter)
14171 +                       rt_mutex_wake_waiter(lock_top_waiter);
14172                 raw_spin_unlock_irq(&lock->wait_lock);
14173                 return 0;
14174         }
14175 @@ -754,6 +778,25 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14176         return ret;
14177  }
14178  
14179 +
14180 +#define STEAL_NORMAL  0
14181 +#define STEAL_LATERAL 1
14182 +
14183 +/*
14184 + * Note that RT tasks are excluded from lateral-steals to prevent the
14185 + * introduction of an unbounded latency
14186 + */
14187 +static inline int lock_is_stealable(struct task_struct *task,
14188 +                                   struct task_struct *pendowner, int mode)
14189 +{
14190 +       if (mode == STEAL_NORMAL || rt_task(task)) {
14191 +               if (task->prio >= pendowner->prio)
14192 +                       return 0;
14193 +       } else if (task->prio > pendowner->prio)
14194 +               return 0;
14195 +       return 1;
14196 +}
14197 +
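In other words: with STEAL_NORMAL, or whenever the would-be thief is an RT task, a strictly higher priority is required to take the lock ahead of the top waiter, while STEAL_LATERAL additionally lets a non-RT task of equal priority win. A standalone model of just that decision, using bare integers where a lower value means higher priority (illustrative only, not kernel code):

        /* Mirrors lock_is_stealable(): returns 1 when the lock may be stolen. */
        static int model_is_stealable(int thief_prio, int thief_is_rt,
                                      int top_waiter_prio, int mode)
        {
                if (mode == STEAL_NORMAL || thief_is_rt)
                        return thief_prio < top_waiter_prio;    /* strictly better */
                return thief_prio <= top_waiter_prio;           /* lateral: ties win too */
        }

Keeping RT tasks on the strict rule is what preserves the bounded-latency guarantee mentioned in the comment above.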
14198  /*
14199   * Try to take an rt-mutex
14200   *
14201 @@ -764,8 +807,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
14202   * @waiter: The waiter that is queued to the lock's wait tree if the
14203   *         callsite called task_blocked_on_lock(), otherwise NULL
14204   */
14205 -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14206 -                               struct rt_mutex_waiter *waiter)
14207 +static int __try_to_take_rt_mutex(struct rt_mutex *lock,
14208 +                                 struct task_struct *task,
14209 +                                 struct rt_mutex_waiter *waiter, int mode)
14210  {
14211         /*
14212          * Before testing whether we can acquire @lock, we set the
14213 @@ -802,8 +846,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14214                  * If waiter is not the highest priority waiter of
14215                  * @lock, give up.
14216                  */
14217 -               if (waiter != rt_mutex_top_waiter(lock))
14218 +               if (waiter != rt_mutex_top_waiter(lock)) {
14219 +                       /* XXX lock_is_stealable() ? */
14220                         return 0;
14221 +               }
14222  
14223                 /*
14224                  * We can acquire the lock. Remove the waiter from the
14225 @@ -821,14 +867,10 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14226                  * not need to be dequeued.
14227                  */
14228                 if (rt_mutex_has_waiters(lock)) {
14229 -                       /*
14230 -                        * If @task->prio is greater than or equal to
14231 -                        * the top waiter priority (kernel view),
14232 -                        * @task lost.
14233 -                        */
14234 -                       if (task->prio >= rt_mutex_top_waiter(lock)->prio)
14235 -                               return 0;
14236 +                       struct task_struct *pown = rt_mutex_top_waiter(lock)->task;
14237  
14238 +                       if (task != pown && !lock_is_stealable(task, pown, mode))
14239 +                               return 0;
14240                         /*
14241                          * The current top waiter stays enqueued. We
14242                          * don't have to change anything in the lock
14243 @@ -877,6 +919,438 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14244         return 1;
14245  }
14246  
14247 +#ifdef CONFIG_PREEMPT_RT_FULL
14248 +/*
14249 + * preemptible spin_lock functions:
14250 + */
14251 +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
14252 +                                        void  (*slowfn)(struct rt_mutex *lock,
14253 +                                                        bool mg_off),
14254 +                                        bool do_mig_dis)
14255 +{
14256 +       might_sleep_no_state_check();
14257 +
14258 +       if (do_mig_dis)
14259 +               migrate_disable();
14260 +
14261 +       if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
14262 +               rt_mutex_deadlock_account_lock(lock, current);
14263 +       else
14264 +               slowfn(lock, do_mig_dis);
14265 +}
14266 +
14267 +static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
14268 +                                         int (*slowfn)(struct rt_mutex *lock))
14269 +{
14270 +       if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
14271 +               rt_mutex_deadlock_account_unlock(current);
14272 +               return 0;
14273 +       }
14274 +       return slowfn(lock);
14275 +}
14276 +#ifdef CONFIG_SMP
14277 +/*
14278 + * Note that owner is a speculative pointer and dereferencing relies
14279 + * on rcu_read_lock() and the check against the lock owner.
14280 + */
14281 +static int adaptive_wait(struct rt_mutex *lock,
14282 +                        struct task_struct *owner)
14283 +{
14284 +       int res = 0;
14285 +
14286 +       rcu_read_lock();
14287 +       for (;;) {
14288 +               if (owner != rt_mutex_owner(lock))
14289 +                       break;
14290 +               /*
14291 +                * Ensure that owner->on_cpu is dereferenced _after_
14292 +                * checking the above to be valid.
14293 +                */
14294 +               barrier();
14295 +               if (!owner->on_cpu) {
14296 +                       res = 1;
14297 +                       break;
14298 +               }
14299 +               cpu_relax();
14300 +       }
14301 +       rcu_read_unlock();
14302 +       return res;
14303 +}
14304 +#else
14305 +static int adaptive_wait(struct rt_mutex *lock,
14306 +                        struct task_struct *orig_owner)
14307 +{
14308 +       return 1;
14309 +}
14310 +#endif
14311 +
14312 +static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14313 +                                  struct rt_mutex_waiter *waiter,
14314 +                                  struct task_struct *task,
14315 +                                  enum rtmutex_chainwalk chwalk);
14316 +/*
14317 + * Slow path lock function spin_lock style: this variant is very
14318 + * careful not to miss any non-lock wakeups.
14319 + *
14320 + * We store the current state under p->pi_lock in p->saved_state and
14321 + * the try_to_wake_up() code handles this accordingly.
14322 + */
14323 +static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
14324 +                                                   bool mg_off)
14325 +{
14326 +       struct task_struct *lock_owner, *self = current;
14327 +       struct rt_mutex_waiter waiter, *top_waiter;
14328 +       unsigned long flags;
14329 +       int ret;
14330 +
14331 +       rt_mutex_init_waiter(&waiter, true);
14332 +
14333 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14334 +
14335 +       if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) {
14336 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14337 +               return;
14338 +       }
14339 +
14340 +       BUG_ON(rt_mutex_owner(lock) == self);
14341 +
14342 +       /*
14343 +        * We save whatever state the task is in and we'll restore it
14344 +        * after acquiring the lock taking real wakeups into account
14345 +        * as well. We are serialized via pi_lock against wakeups. See
14346 +        * try_to_wake_up().
14347 +        */
14348 +       raw_spin_lock(&self->pi_lock);
14349 +       self->saved_state = self->state;
14350 +       __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14351 +       raw_spin_unlock(&self->pi_lock);
14352 +
14353 +       ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
14354 +       BUG_ON(ret);
14355 +
14356 +       for (;;) {
14357 +               /* Try to acquire the lock again. */
14358 +               if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
14359 +                       break;
14360 +
14361 +               top_waiter = rt_mutex_top_waiter(lock);
14362 +               lock_owner = rt_mutex_owner(lock);
14363 +
14364 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14365 +
14366 +               debug_rt_mutex_print_deadlock(&waiter);
14367 +
14368 +               if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
14369 +                       if (mg_off)
14370 +                               migrate_enable();
14371 +                       schedule();
14372 +                       if (mg_off)
14373 +                               migrate_disable();
14374 +               }
14375 +
14376 +               raw_spin_lock_irqsave(&lock->wait_lock, flags);
14377 +
14378 +               raw_spin_lock(&self->pi_lock);
14379 +               __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
14380 +               raw_spin_unlock(&self->pi_lock);
14381 +       }
14382 +
14383 +       /*
14384 +        * Restore the task state to current->saved_state. We saved
14385 +        * the original state above and the try_to_wake_up() code has
14386 +        * possibly updated it when a real (non-rtmutex) wakeup
14387 +        * happened while we were blocked. Clear saved_state so
14388 +        * try_to_wake_up() does not get confused.
14389 +        */
14390 +       raw_spin_lock(&self->pi_lock);
14391 +       __set_current_state_no_track(self->saved_state);
14392 +       self->saved_state = TASK_RUNNING;
14393 +       raw_spin_unlock(&self->pi_lock);
14394 +
14395 +       /*
14396 +        * try_to_take_rt_mutex() sets the waiter bit
14397 +        * unconditionally. We might have to fix that up:
14398 +        */
14399 +       fixup_rt_mutex_waiters(lock);
14400 +
14401 +       BUG_ON(rt_mutex_has_waiters(lock) && &waiter == rt_mutex_top_waiter(lock));
14402 +       BUG_ON(!RB_EMPTY_NODE(&waiter.tree_entry));
14403 +
14404 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14405 +
14406 +       debug_rt_mutex_free_waiter(&waiter);
14407 +}
14408 +
14409 +static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14410 +                                   struct wake_q_head *wake_sleeper_q,
14411 +                                   struct rt_mutex *lock);
14412 +/*
14413 + * Slow path to release a rt_mutex spin_lock style
14414 + */
14415 +static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
14416 +{
14417 +       unsigned long flags;
14418 +       WAKE_Q(wake_q);
14419 +       WAKE_Q(wake_sleeper_q);
14420 +
14421 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14422 +
14423 +       debug_rt_mutex_unlock(lock);
14424 +
14425 +       rt_mutex_deadlock_account_unlock(current);
14426 +
14427 +       if (!rt_mutex_has_waiters(lock)) {
14428 +               lock->owner = NULL;
14429 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14430 +               return 0;
14431 +       }
14432 +
14433 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
14434 +
14435 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14436 +       wake_up_q(&wake_q);
14437 +       wake_up_q_sleeper(&wake_sleeper_q);
14438 +
14439 +       /* Undo pi boosting when necessary */
14440 +       rt_mutex_adjust_prio(current);
14441 +       return 0;
14442 +}
14443 +
14444 +static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
14445 +{
14446 +       unsigned long flags;
14447 +       WAKE_Q(wake_q);
14448 +       WAKE_Q(wake_sleeper_q);
14449 +
14450 +       raw_spin_lock_irqsave(&lock->wait_lock, flags);
14451 +
14452 +       debug_rt_mutex_unlock(lock);
14453 +
14454 +       rt_mutex_deadlock_account_unlock(current);
14455 +
14456 +       if (!rt_mutex_has_waiters(lock)) {
14457 +               lock->owner = NULL;
14458 +               raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14459 +               return 0;
14460 +       }
14461 +
14462 +       mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
14463 +
14464 +       raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14465 +       wake_up_q(&wake_q);
14466 +       wake_up_q_sleeper(&wake_sleeper_q);
14467 +       return 1;
14468 +}
14469 +
14470 +void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
14471 +{
14472 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
14473 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14474 +}
14475 +EXPORT_SYMBOL(rt_spin_lock__no_mg);
14476 +
14477 +void __lockfunc rt_spin_lock(spinlock_t *lock)
14478 +{
14479 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
14480 +       spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
14481 +}
14482 +EXPORT_SYMBOL(rt_spin_lock);
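Assuming the usual RT mapping of the spinlock_t API onto these helpers (spin_lock() onto rt_spin_lock(), spin_unlock() onto rt_spin_unlock()), callers keep their familiar shape while the lock itself becomes a sleeping, priority-inheriting rt_mutex with migration disabled across the critical section. A hypothetical caller, not part of this patch:

        static DEFINE_SPINLOCK(ev_lock);        /* an rt_mutex-backed lock on RT */
        static unsigned int ev_pending;

        static void ev_queue(void)
        {
                spin_lock(&ev_lock);            /* rt_spin_lock(): may sleep, migrate_disable() */
                ev_pending++;
                spin_unlock(&ev_lock);          /* rt_spin_unlock(): migrate_enable() */
        }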
14483 +
14484 +void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
14485 +{
14486 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
14487 +}
14488 +EXPORT_SYMBOL(__rt_spin_lock);
14489 +
14490 +void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
14491 +{
14492 +       rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
14493 +}
14494 +EXPORT_SYMBOL(__rt_spin_lock__no_mg);
14495 +
14496 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14497 +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
14498 +{
14499 +       spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
14500 +       rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
14501 +}
14502 +EXPORT_SYMBOL(rt_spin_lock_nested);
14503 +#endif
14504 +
14505 +void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
14506 +{
14507 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14508 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14509 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
14510 +}
14511 +EXPORT_SYMBOL(rt_spin_unlock__no_mg);
14512 +
14513 +void __lockfunc rt_spin_unlock(spinlock_t *lock)
14514 +{
14515 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14516 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14517 +       rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
14518 +       migrate_enable();
14519 +}
14520 +EXPORT_SYMBOL(rt_spin_unlock);
14521 +
14522 +int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
14523 +{
14524 +       int ret;
14525 +
14526 +       /* NOTE: we always pass in '1' for nested, for simplicity */
14527 +       spin_release(&lock->dep_map, 1, _RET_IP_);
14528 +       ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
14529 +       migrate_enable();
14530 +       return ret;
14531 +}
14532 +
14533 +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
14534 +{
14535 +       rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
14536 +}
14537 +EXPORT_SYMBOL(__rt_spin_unlock);
14538 +
14539 +/*
14540 + * Wait for the lock to get unlocked: instead of polling for an unlock
14541 + * (like raw spinlocks do), we lock and unlock, to force the kernel to
14542 + * schedule if there's contention:
14543 + */
14544 +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock)
14545 +{
14546 +       spin_lock(lock);
14547 +       spin_unlock(lock);
14548 +}
14549 +EXPORT_SYMBOL(rt_spin_unlock_wait);
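As the comment explains, the RT variant of spin_unlock_wait() briefly takes and releases the lock instead of spinning on its state, so contention forces a schedule. A typical teardown caller that only needs to know the last holder has left its critical section might look like this (the device structure is hypothetical, not from this patch):

        static void ev_dev_free(struct ev_dev *dev)
        {
                /*
                 * The device has already been unpublished, so no new
                 * spin_lock(&dev->lock) callers can appear; wait out any
                 * holder that raced with removal, then free the object.
                 */
                spin_unlock_wait(&dev->lock);
                kfree(dev);
        }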
14550 +
14551 +int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
14552 +{
14553 +       return rt_mutex_trylock(lock);
14554 +}
14555 +
14556 +int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
14557 +{
14558 +       int ret;
14559 +
14560 +       ret = rt_mutex_trylock(&lock->lock);
14561 +       if (ret)
14562 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14563 +       return ret;
14564 +}
14565 +EXPORT_SYMBOL(rt_spin_trylock__no_mg);
14566 +
14567 +int __lockfunc rt_spin_trylock(spinlock_t *lock)
14568 +{
14569 +       int ret;
14570 +
14571 +       migrate_disable();
14572 +       ret = rt_mutex_trylock(&lock->lock);
14573 +       if (ret)
14574 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14575 +       else
14576 +               migrate_enable();
14577 +       return ret;
14578 +}
14579 +EXPORT_SYMBOL(rt_spin_trylock);
14580 +
14581 +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
14582 +{
14583 +       int ret;
14584 +
14585 +       local_bh_disable();
14586 +       ret = rt_mutex_trylock(&lock->lock);
14587 +       if (ret) {
14588 +               migrate_disable();
14589 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14590 +       } else
14591 +               local_bh_enable();
14592 +       return ret;
14593 +}
14594 +EXPORT_SYMBOL(rt_spin_trylock_bh);
14595 +
14596 +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
14597 +{
14598 +       int ret;
14599 +
14600 +       *flags = 0;
14601 +       ret = rt_mutex_trylock(&lock->lock);
14602 +       if (ret) {
14603 +               migrate_disable();
14604 +               spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
14605 +       }
14606 +       return ret;
14607 +}
14608 +EXPORT_SYMBOL(rt_spin_trylock_irqsave);
14609 +
14610 +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
14611 +{
14612 +       /* Subtract 1 from counter unless that drops it to 0 (i.e. it was 1) */
14613 +       if (atomic_add_unless(atomic, -1, 1))
14614 +               return 0;
14615 +       rt_spin_lock(lock);
14616 +       if (atomic_dec_and_test(atomic))
14617 +               return 1;
14618 +       rt_spin_unlock(lock);
14619 +       return 0;
14620 +}
14621 +EXPORT_SYMBOL(atomic_dec_and_spin_lock);
14622 +
14623 +void __rt_spin_lock_init(spinlock_t *lock, char *name,
14624 +                        struct lock_class_key *key)
14625 +{
14626 +#ifdef CONFIG_DEBUG_LOCK_ALLOC
14627 +       /*
14628 +        * Make sure we are not reinitializing a held lock:
14629 +        */
14630 +       debug_check_no_locks_freed((void *)lock, sizeof(*lock));
14631 +       lockdep_init_map(&lock->dep_map, name, key, 0);
14632 +#endif
14633 +}
14634 +EXPORT_SYMBOL(__rt_spin_lock_init);
14635 +
14636 +#endif /* PREEMPT_RT_FULL */
14637 +
14638 +#ifdef CONFIG_PREEMPT_RT_FULL
14639 +static inline int __sched
14640 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
14641 +{
14642 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14643 +       struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
14644 +
14645 +       if (!hold_ctx)
14646 +               return 0;
14647 +
14648 +       if (unlikely(ctx == hold_ctx))
14649 +               return -EALREADY;
14650 +
14651 +       if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
14652 +           (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
14653 +#ifdef CONFIG_DEBUG_MUTEXES
14654 +               DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
14655 +               ctx->contending_lock = ww;
14656 +#endif
14657 +               return -EDEADLK;
14658 +       }
14659 +
14660 +       return 0;
14661 +}
14662 +#else
14663 +static inline int __sched
14664 +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx)
14665 +{
14666 +       BUG();
14667 +       return 0;
14668 +}
14669 +
14670 +#endif
14671 +
14672 +static inline int
14673 +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
14674 +                    struct rt_mutex_waiter *waiter)
14675 +{
14676 +       return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL);
14677 +}
14678 +
14679  /*
14680   * Task blocks on lock.
14681   *
14682 @@ -907,6 +1381,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14683                 return -EDEADLK;
14684  
14685         raw_spin_lock(&task->pi_lock);
14686 +
14687 +       /*
14688 +        * In the case of futex requeue PI, this will be a proxy
14689 +        * lock. The task will wake unaware that it is enqueued on
14690 +        * this lock. Avoid blocking on two locks and corrupting
14691 +        * pi_blocked_on via the PI_WAKEUP_INPROGRESS
14692 +        * flag. futex_wait_requeue_pi() sets this when it wakes up
14693 +        * before requeue (due to a signal or timeout). Do not enqueue
14694 +        * the task if PI_WAKEUP_INPROGRESS is set.
14695 +        */
14696 +       if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) {
14697 +               raw_spin_unlock(&task->pi_lock);
14698 +               return -EAGAIN;
14699 +       }
14700 +
14701 +       BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
14702 +
14703         __rt_mutex_adjust_prio(task);
14704         waiter->task = task;
14705         waiter->lock = lock;
14706 @@ -930,7 +1421,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14707                 rt_mutex_enqueue_pi(owner, waiter);
14708  
14709                 __rt_mutex_adjust_prio(owner);
14710 -               if (owner->pi_blocked_on)
14711 +               if (rt_mutex_real_waiter(owner->pi_blocked_on))
14712                         chain_walk = 1;
14713         } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
14714                 chain_walk = 1;
14715 @@ -972,6 +1463,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
14716   * Called with lock->wait_lock held and interrupts disabled.
14717   */
14718  static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14719 +                                   struct wake_q_head *wake_sleeper_q,
14720                                     struct rt_mutex *lock)
14721  {
14722         struct rt_mutex_waiter *waiter;
14723 @@ -1000,7 +1492,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
14724  
14725         raw_spin_unlock(&current->pi_lock);
14726  
14727 -       wake_q_add(wake_q, waiter->task);
14728 +       if (waiter->savestate)
14729 +               wake_q_add(wake_sleeper_q, waiter->task);
14730 +       else
14731 +               wake_q_add(wake_q, waiter->task);
14732  }
14733  
14734  /*
14735 @@ -1014,7 +1509,7 @@ static void remove_waiter(struct rt_mutex *lock,
14736  {
14737         bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
14738         struct task_struct *owner = rt_mutex_owner(lock);
14739 -       struct rt_mutex *next_lock;
14740 +       struct rt_mutex *next_lock = NULL;
14741  
14742         raw_spin_lock(&current->pi_lock);
14743         rt_mutex_dequeue(lock, waiter);
14744 @@ -1038,7 +1533,8 @@ static void remove_waiter(struct rt_mutex *lock,
14745         __rt_mutex_adjust_prio(owner);
14746  
14747         /* Store the lock on which owner is blocked or NULL */
14748 -       next_lock = task_blocked_on_lock(owner);
14749 +       if (rt_mutex_real_waiter(owner->pi_blocked_on))
14750 +               next_lock = task_blocked_on_lock(owner);
14751  
14752         raw_spin_unlock(&owner->pi_lock);
14753  
14754 @@ -1074,17 +1570,17 @@ void rt_mutex_adjust_pi(struct task_struct *task)
14755         raw_spin_lock_irqsave(&task->pi_lock, flags);
14756  
14757         waiter = task->pi_blocked_on;
14758 -       if (!waiter || (waiter->prio == task->prio &&
14759 +       if (!rt_mutex_real_waiter(waiter) || (waiter->prio == task->prio &&
14760                         !dl_prio(task->prio))) {
14761                 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14762                 return;
14763         }
14764         next_lock = waiter->lock;
14765 -       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14766  
14767         /* gets dropped in rt_mutex_adjust_prio_chain()! */
14768         get_task_struct(task);
14769  
14770 +       raw_spin_unlock_irqrestore(&task->pi_lock, flags);
14771         rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
14772                                    next_lock, NULL, task);
14773  }
14774 @@ -1102,7 +1598,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
14775  static int __sched
14776  __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14777                     struct hrtimer_sleeper *timeout,
14778 -                   struct rt_mutex_waiter *waiter)
14779 +                   struct rt_mutex_waiter *waiter,
14780 +                   struct ww_acquire_ctx *ww_ctx)
14781  {
14782         int ret = 0;
14783  
14784 @@ -1125,6 +1622,12 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
14785                                 break;
14786                 }
14787  
14788 +               if (ww_ctx && ww_ctx->acquired > 0) {
14789 +                       ret = __mutex_lock_check_stamp(lock, ww_ctx);
14790 +                       if (ret)
14791 +                               break;
14792 +               }
14793 +
14794                 raw_spin_unlock_irq(&lock->wait_lock);
14795  
14796                 debug_rt_mutex_print_deadlock(waiter);
14797 @@ -1159,21 +1662,96 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
14798         }
14799  }
14800  
14801 +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
14802 +                                                  struct ww_acquire_ctx *ww_ctx)
14803 +{
14804 +#ifdef CONFIG_DEBUG_MUTEXES
14805 +       /*
14806 +        * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
14807 +        * but released with a normal mutex_unlock in this call.
14808 +        *
14809 +        * This should never happen, always use ww_mutex_unlock.
14810 +        */
14811 +       DEBUG_LOCKS_WARN_ON(ww->ctx);
14812 +
14813 +       /*
14814 +        * Not quite done after calling ww_acquire_done() ?
14815 +        */
14816 +       DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
14817 +
14818 +       if (ww_ctx->contending_lock) {
14819 +               /*
14820 +                * After -EDEADLK you tried to
14821 +                * acquire a different ww_mutex? Bad!
14822 +                */
14823 +               DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
14824 +
14825 +               /*
14826 +                * You called ww_mutex_lock after receiving -EDEADLK,
14827 +                * but 'forgot' to unlock everything else first?
14828 +                */
14829 +               DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
14830 +               ww_ctx->contending_lock = NULL;
14831 +       }
14832 +
14833 +       /*
14834 +        * Naughty, using a different class will lead to undefined behavior!
14835 +        */
14836 +       DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
14837 +#endif
14838 +       ww_ctx->acquired++;
14839 +}
14840 +
14841 +#ifdef CONFIG_PREEMPT_RT_FULL
14842 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14843 +                                 struct ww_acquire_ctx *ww_ctx)
14844 +{
14845 +       struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock);
14846 +       struct rt_mutex_waiter *waiter, *n;
14847 +
14848 +       /*
14849 +        * This branch gets optimized out for the common case,
14850 +        * and is only important for ww_mutex_lock.
14851 +        */
14852 +       ww_mutex_lock_acquired(ww, ww_ctx);
14853 +       ww->ctx = ww_ctx;
14854 +
14855 +       /*
14856 +        * Give any possible sleeping processes the chance to wake up,
14857 +        * so they can recheck if they have to back off.
14858 +        */
14859 +       rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters,
14860 +                                            tree_entry) {
14861 +               /* XXX debug rt mutex waiter wakeup */
14862 +
14863 +               BUG_ON(waiter->lock != lock);
14864 +               rt_mutex_wake_waiter(waiter);
14865 +       }
14866 +}
14867 +
14868 +#else
14869 +
14870 +static void ww_mutex_account_lock(struct rt_mutex *lock,
14871 +                                 struct ww_acquire_ctx *ww_ctx)
14872 +{
14873 +       BUG();
14874 +}
14875 +#endif
14876 +
14877  /*
14878   * Slow path lock function:
14879   */
14880  static int __sched
14881  rt_mutex_slowlock(struct rt_mutex *lock, int state,
14882                   struct hrtimer_sleeper *timeout,
14883 -                 enum rtmutex_chainwalk chwalk)
14884 +                 enum rtmutex_chainwalk chwalk,
14885 +                 struct ww_acquire_ctx *ww_ctx)
14886  {
14887         struct rt_mutex_waiter waiter;
14888         unsigned long flags;
14889         int ret = 0;
14890  
14891 -       debug_rt_mutex_init_waiter(&waiter);
14892 -       RB_CLEAR_NODE(&waiter.pi_tree_entry);
14893 -       RB_CLEAR_NODE(&waiter.tree_entry);
14894 +       rt_mutex_init_waiter(&waiter, false);
14895  
14896         /*
14897          * Technically we could use raw_spin_[un]lock_irq() here, but this can
14898 @@ -1187,6 +1765,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
14899  
14900         /* Try to acquire the lock again: */
14901         if (try_to_take_rt_mutex(lock, current, NULL)) {
14902 +               if (ww_ctx)
14903 +                       ww_mutex_account_lock(lock, ww_ctx);
14904                 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14905                 return 0;
14906         }
14907 @@ -1201,13 +1781,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
14908  
14909         if (likely(!ret))
14910                 /* sleep on the mutex */
14911 -               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
14912 +               ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
14913 +                                         ww_ctx);
14914 +       else if (ww_ctx) {
14915 +               /* ww_mutex received EDEADLK, let it become EALREADY */
14916 +               ret = __mutex_lock_check_stamp(lock, ww_ctx);
14917 +               BUG_ON(!ret);
14918 +       }
14919  
14920         if (unlikely(ret)) {
14921                 __set_current_state(TASK_RUNNING);
14922                 if (rt_mutex_has_waiters(lock))
14923                         remove_waiter(lock, &waiter);
14924 -               rt_mutex_handle_deadlock(ret, chwalk, &waiter);
14925 +               /* ww_mutex want to report EDEADLK/EALREADY, let them */
14926 +               if (!ww_ctx)
14927 +                       rt_mutex_handle_deadlock(ret, chwalk, &waiter);
14928 +       } else if (ww_ctx) {
14929 +               ww_mutex_account_lock(lock, ww_ctx);
14930         }
14931  
14932         /*
14933 @@ -1267,7 +1857,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
14934   * Return whether the current task needs to undo a potential priority boosting.
14935   */
14936  static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14937 -                                       struct wake_q_head *wake_q)
14938 +                                       struct wake_q_head *wake_q,
14939 +                                       struct wake_q_head *wake_sleeper_q)
14940  {
14941         unsigned long flags;
14942  
14943 @@ -1323,7 +1914,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14944          *
14945          * Queue the next waiter for wakeup once we release the wait_lock.
14946          */
14947 -       mark_wakeup_next_waiter(wake_q, lock);
14948 +       mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock);
14949  
14950         raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
14951  
14952 @@ -1339,31 +1930,36 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
14953   */
14954  static inline int
14955  rt_mutex_fastlock(struct rt_mutex *lock, int state,
14956 +                 struct ww_acquire_ctx *ww_ctx,
14957                   int (*slowfn)(struct rt_mutex *lock, int state,
14958                                 struct hrtimer_sleeper *timeout,
14959 -                               enum rtmutex_chainwalk chwalk))
14960 +                               enum rtmutex_chainwalk chwalk,
14961 +                               struct ww_acquire_ctx *ww_ctx))
14962  {
14963         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14964                 rt_mutex_deadlock_account_lock(lock, current);
14965                 return 0;
14966         } else
14967 -               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
14968 +               return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
14969 +                             ww_ctx);
14970  }
14971  
14972  static inline int
14973  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
14974                         struct hrtimer_sleeper *timeout,
14975                         enum rtmutex_chainwalk chwalk,
14976 +                       struct ww_acquire_ctx *ww_ctx,
14977                         int (*slowfn)(struct rt_mutex *lock, int state,
14978                                       struct hrtimer_sleeper *timeout,
14979 -                                     enum rtmutex_chainwalk chwalk))
14980 +                                     enum rtmutex_chainwalk chwalk,
14981 +                                     struct ww_acquire_ctx *ww_ctx))
14982  {
14983         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
14984             likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
14985                 rt_mutex_deadlock_account_lock(lock, current);
14986                 return 0;
14987         } else
14988 -               return slowfn(lock, state, timeout, chwalk);
14989 +               return slowfn(lock, state, timeout, chwalk, ww_ctx);
14990  }
14991  
14992  static inline int
14993 @@ -1380,17 +1976,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lock,
14994  static inline void
14995  rt_mutex_fastunlock(struct rt_mutex *lock,
14996                     bool (*slowfn)(struct rt_mutex *lock,
14997 -                                  struct wake_q_head *wqh))
14998 +                                  struct wake_q_head *wqh,
14999 +                                  struct wake_q_head *wq_sleeper))
15000  {
15001         WAKE_Q(wake_q);
15002 +       WAKE_Q(wake_sleeper_q);
15003  
15004         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15005                 rt_mutex_deadlock_account_unlock(current);
15006  
15007         } else {
15008 -               bool deboost = slowfn(lock, &wake_q);
15009 +               bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
15010  
15011                 wake_up_q(&wake_q);
15012 +               wake_up_q_sleeper(&wake_sleeper_q);
15013  
15014                 /* Undo pi boosting if necessary: */
15015                 if (deboost)
15016 @@ -1407,7 +2006,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
15017  {
15018         might_sleep();
15019  
15020 -       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
15021 +       rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, NULL, rt_mutex_slowlock);
15022  }
15023  EXPORT_SYMBOL_GPL(rt_mutex_lock);
15024  
15025 @@ -1424,7 +2023,7 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
15026  {
15027         might_sleep();
15028  
15029 -       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
15030 +       return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, NULL, rt_mutex_slowlock);
15031  }
15032  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
15033  
15034 @@ -1437,11 +2036,30 @@ int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
15035         might_sleep();
15036  
15037         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15038 -                                      RT_MUTEX_FULL_CHAINWALK,
15039 +                                      RT_MUTEX_FULL_CHAINWALK, NULL,
15040                                        rt_mutex_slowlock);
15041  }
15042  
15043  /**
15044 + * rt_mutex_lock_killable - lock a rt_mutex killable
15045 + *
15046 + * @lock:              the rt_mutex to be locked
15048 + *
15049 + * Returns:
15050 + *  0          on success
15051 + * -EINTR      when interrupted by a signal
15052 + * -EDEADLK    when the lock would deadlock (when deadlock detection is on)
15053 + */
15054 +int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
15055 +{
15056 +       might_sleep();
15057 +
15058 +       return rt_mutex_fastlock(lock, TASK_KILLABLE, NULL, rt_mutex_slowlock);
15059 +}
15060 +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
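A caller of rt_mutex_lock_killable() has to be prepared for the wait to be broken by a fatal signal and to back out on -EINTR; a minimal sketch (hypothetical caller, not from this patch):

        static int frob_resource(struct rt_mutex *lock)
        {
                int ret;

                ret = rt_mutex_lock_killable(lock);
                if (ret)
                        return ret;     /* -EINTR: a fatal signal arrived while waiting */

                /* ... critical section ... */

                rt_mutex_unlock(lock);
                return 0;
        }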
15061 +
15062 +/**
15063   * rt_mutex_timed_lock - lock a rt_mutex interruptible
15064   *                     the timeout structure is provided
15065   *                     by the caller
15066 @@ -1461,6 +2079,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
15067  
15068         return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
15069                                        RT_MUTEX_MIN_CHAINWALK,
15070 +                                      NULL,
15071                                        rt_mutex_slowlock);
15072  }
15073  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15074 @@ -1478,7 +2097,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
15075   */
15076  int __sched rt_mutex_trylock(struct rt_mutex *lock)
15077  {
15078 +#ifdef CONFIG_PREEMPT_RT_FULL
15079 +       if (WARN_ON_ONCE(in_irq() || in_nmi()))
15080 +#else
15081         if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
15082 +#endif
15083                 return 0;
15084  
15085         return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
15086 @@ -1504,13 +2127,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
15087   * required or not.
15088   */
15089  bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
15090 -                                  struct wake_q_head *wqh)
15091 +                                  struct wake_q_head *wqh,
15092 +                                  struct wake_q_head *wq_sleeper)
15093  {
15094         if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
15095                 rt_mutex_deadlock_account_unlock(current);
15096                 return false;
15097         }
15098 -       return rt_mutex_slowunlock(lock, wqh);
15099 +       return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
15100  }
15101  
15102  /**
15103 @@ -1543,13 +2167,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
15104  void __rt_mutex_init(struct rt_mutex *lock, const char *name)
15105  {
15106         lock->owner = NULL;
15107 -       raw_spin_lock_init(&lock->wait_lock);
15108         lock->waiters = RB_ROOT;
15109         lock->waiters_leftmost = NULL;
15110  
15111         debug_rt_mutex_init(lock, name);
15112  }
15113 -EXPORT_SYMBOL_GPL(__rt_mutex_init);
15114 +EXPORT_SYMBOL(__rt_mutex_init);
15115  
15116  /**
15117   * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
15118 @@ -1564,7 +2187,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
15119  void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
15120                                 struct task_struct *proxy_owner)
15121  {
15122 -       __rt_mutex_init(lock, NULL);
15123 +       rt_mutex_init(lock);
15124         debug_rt_mutex_proxy_lock(lock, proxy_owner);
15125         rt_mutex_set_owner(lock, proxy_owner);
15126         rt_mutex_deadlock_account_lock(lock, proxy_owner);
15127 @@ -1612,6 +2235,35 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15128                 return 1;
15129         }
15130  
15131 +#ifdef CONFIG_PREEMPT_RT_FULL
15132 +       /*
15133 +        * In PREEMPT_RT there's an added race.
15134 +        * If the task, that we are about to requeue, times out,
15135 +        * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue
15136 +        * to skip this task. But right after the task sets
15137 +        * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then
15138 +        * block on the spin_lock(&hb->lock), which in RT is an rtmutex.
15139 +        * This will replace the PI_WAKEUP_INPROGRESS with the actual
15140 +        * lock that it blocks on. We *must not* place this task
15141 +        * on this proxy lock in that case.
15142 +        *
15143 +        * To prevent this race, we first take the task's pi_lock
15144 +        * and check if it has updated its pi_blocked_on. If it has,
15145 +        * we assume that it woke up and we return -EAGAIN.
15146 +        * Otherwise, we set the task's pi_blocked_on to
15147 +        * PI_REQUEUE_INPROGRESS, so that if the task is waking up
15148 +        * it will know that we are in the process of requeuing it.
15149 +        */
15150 +       raw_spin_lock(&task->pi_lock);
15151 +       if (task->pi_blocked_on) {
15152 +               raw_spin_unlock(&task->pi_lock);
15153 +               raw_spin_unlock_irq(&lock->wait_lock);
15154 +               return -EAGAIN;
15155 +       }
15156 +       task->pi_blocked_on = PI_REQUEUE_INPROGRESS;
15157 +       raw_spin_unlock(&task->pi_lock);
15158 +#endif
15159 +
15160         /* We enforce deadlock detection for futexes */
15161         ret = task_blocks_on_rt_mutex(lock, waiter, task,
15162                                       RT_MUTEX_FULL_CHAINWALK);
15163 @@ -1626,7 +2278,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
15164                 ret = 0;
15165         }
15166  
15167 -       if (unlikely(ret))
15168 +       if (ret && rt_mutex_has_waiters(lock))
15169                 remove_waiter(lock, waiter);
15170  
15171         raw_spin_unlock_irq(&lock->wait_lock);
15172 @@ -1682,7 +2334,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15173         set_current_state(TASK_INTERRUPTIBLE);
15174  
15175         /* sleep on the mutex */
15176 -       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
15177 +       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
15178  
15179         if (unlikely(ret))
15180                 remove_waiter(lock, waiter);
15181 @@ -1697,3 +2349,89 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15182  
15183         return ret;
15184  }
15185 +
15186 +static inline int
15187 +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
15188 +{
15189 +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
15190 +       unsigned tmp;
15191 +
15192 +       if (ctx->deadlock_inject_countdown-- == 0) {
15193 +               tmp = ctx->deadlock_inject_interval;
15194 +               if (tmp > UINT_MAX/4)
15195 +                       tmp = UINT_MAX;
15196 +               else
15197 +                       tmp = tmp*2 + tmp + tmp/2;
15198 +
15199 +               ctx->deadlock_inject_interval = tmp;
15200 +               ctx->deadlock_inject_countdown = tmp;
15201 +               ctx->contending_lock = lock;
15202 +
15203 +               ww_mutex_unlock(lock);
15204 +
15205 +               return -EDEADLK;
15206 +       }
15207 +#endif
15208 +
15209 +       return 0;
15210 +}
15211 +
15212 +#ifdef CONFIG_PREEMPT_RT_FULL
15213 +int __sched
15214 +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
15215 +{
15216 +       int ret;
15217 +
15218 +       might_sleep();
15219 +
15220 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
15221 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ww_ctx);
15222 +       if (ret)
15223 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
15224 +       else if (!ret && ww_ctx->acquired > 1)
15225 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
15226 +
15227 +       return ret;
15228 +}
15229 +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
15230 +
15231 +int __sched
15232 +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx)
15233 +{
15234 +       int ret;
15235 +
15236 +       might_sleep();
15237 +
15238 +       mutex_acquire_nest(&lock->base.dep_map, 0, 0, &ww_ctx->dep_map, _RET_IP_);
15239 +       ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ww_ctx);
15240 +       if (ret)
15241 +               mutex_release(&lock->base.dep_map, 1, _RET_IP_);
15242 +       else if (!ret && ww_ctx->acquired > 1)
15243 +               return ww_mutex_deadlock_injection(lock, ww_ctx);
15244 +
15245 +       return ret;
15246 +}
15247 +EXPORT_SYMBOL_GPL(__ww_mutex_lock);
15248 +
15249 +void __sched ww_mutex_unlock(struct ww_mutex *lock)
15250 +{
15251 +       int nest = !!lock->ctx;
15252 +
15253 +       /*
15254 +        * The unlocking fastpath is the 0->1 transition from 'locked'
15255 +        * into 'unlocked' state:
15256 +        */
15257 +       if (nest) {
15258 +#ifdef CONFIG_DEBUG_MUTEXES
15259 +               DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
15260 +#endif
15261 +               if (lock->ctx->acquired > 0)
15262 +                       lock->ctx->acquired--;
15263 +               lock->ctx = NULL;
15264 +       }
15265 +
15266 +       mutex_release(&lock->base.dep_map, nest, _RET_IP_);
15267 +       rt_mutex_unlock(&lock->base.lock);
15268 +}
15269 +EXPORT_SYMBOL(ww_mutex_unlock);
15270 +#endif
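These RT implementations keep the ww_mutex contract intact: a lock attempt inside an acquire context may fail with -EDEADLK (or -EALREADY), and the caller is expected to drop every lock it holds in that context and retry. A sketch of the standard backoff loop, assuming hypothetical objects a and b and a ww_class defined elsewhere:

        static void lock_both(struct obj *a, struct obj *b)
        {
                struct ww_acquire_ctx ctx;
                int ret;

        retry:
                ww_acquire_init(&ctx, &my_ww_class);

                ww_mutex_lock(&a->lock, &ctx);  /* first lock in the ctx cannot deadlock */
                ret = ww_mutex_lock(&b->lock, &ctx);
                if (ret == -EDEADLK) {
                        ww_mutex_unlock(&a->lock);
                        ww_acquire_fini(&ctx);
                        goto retry;             /* drop everything, then try again */
                }
                ww_acquire_done(&ctx);

                /* ... work on both objects ... */

                ww_mutex_unlock(&b->lock);
                ww_mutex_unlock(&a->lock);
                ww_acquire_fini(&ctx);
        }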
15271 diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
15272 index 4f5f83c7d2d3..289f062f26cd 100644
15273 --- a/kernel/locking/rtmutex_common.h
15274 +++ b/kernel/locking/rtmutex_common.h
15275 @@ -27,6 +27,7 @@ struct rt_mutex_waiter {
15276         struct rb_node          pi_tree_entry;
15277         struct task_struct      *task;
15278         struct rt_mutex         *lock;
15279 +       bool                    savestate;
15280  #ifdef CONFIG_DEBUG_RT_MUTEXES
15281         unsigned long           ip;
15282         struct pid              *deadlock_task_pid;
15283 @@ -97,6 +98,9 @@ enum rtmutex_chainwalk {
15284  /*
15285   * PI-futex support (proxy locking functions, etc.):
15286   */
15287 +#define PI_WAKEUP_INPROGRESS   ((struct rt_mutex_waiter *) 1)
15288 +#define PI_REQUEUE_INPROGRESS  ((struct rt_mutex_waiter *) 2)
15289 +
15290  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
15291  extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
15292                                        struct task_struct *proxy_owner);
15293 @@ -110,7 +114,8 @@ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
15294                                       struct rt_mutex_waiter *waiter);
15295  extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
15296  extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
15297 -                                 struct wake_q_head *wqh);
15298 +                                 struct wake_q_head *wqh,
15299 +                                 struct wake_q_head *wq_sleeper);
15300  extern void rt_mutex_adjust_prio(struct task_struct *task);
15301  
15302  #ifdef CONFIG_DEBUG_RT_MUTEXES
15303 @@ -119,4 +124,14 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
15304  # include "rtmutex.h"
15305  #endif
15306  
15307 +static inline void
15308 +rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
15309 +{
15310 +       debug_rt_mutex_init_waiter(waiter);
15311 +       waiter->task = NULL;
15312 +       waiter->savestate = savestate;
15313 +       RB_CLEAR_NODE(&waiter->pi_tree_entry);
15314 +       RB_CLEAR_NODE(&waiter->tree_entry);
15315 +}
15316 +
15317  #endif
15318 diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
15319 index db3ccb1dd614..909779647bd1 100644
15320 --- a/kernel/locking/spinlock.c
15321 +++ b/kernel/locking/spinlock.c
15322 @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock)           \
15323   *         __[spin|read|write]_lock_bh()
15324   */
15325  BUILD_LOCK_OPS(spin, raw_spinlock);
15326 +
15327 +#ifndef CONFIG_PREEMPT_RT_FULL
15328  BUILD_LOCK_OPS(read, rwlock);
15329  BUILD_LOCK_OPS(write, rwlock);
15330 +#endif
15331  
15332  #endif
15333  
15334 @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
15335  EXPORT_SYMBOL(_raw_spin_unlock_bh);
15336  #endif
15337  
15338 +#ifndef CONFIG_PREEMPT_RT_FULL
15339 +
15340  #ifndef CONFIG_INLINE_READ_TRYLOCK
15341  int __lockfunc _raw_read_trylock(rwlock_t *lock)
15342  {
15343 @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
15344  EXPORT_SYMBOL(_raw_write_unlock_bh);
15345  #endif
15346  
15347 +#endif /* !PREEMPT_RT_FULL */
15348 +
15349  #ifdef CONFIG_DEBUG_LOCK_ALLOC
15350  
15351  void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
15352 diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
15353 index 0374a596cffa..94970338d518 100644
15354 --- a/kernel/locking/spinlock_debug.c
15355 +++ b/kernel/locking/spinlock_debug.c
15356 @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
15357  
15358  EXPORT_SYMBOL(__raw_spin_lock_init);
15359  
15360 +#ifndef CONFIG_PREEMPT_RT_FULL
15361  void __rwlock_init(rwlock_t *lock, const char *name,
15362                    struct lock_class_key *key)
15363  {
15364 @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name,
15365  }
15366  
15367  EXPORT_SYMBOL(__rwlock_init);
15368 +#endif
15369  
15370  static void spin_dump(raw_spinlock_t *lock, const char *msg)
15371  {
15372 @@ -159,6 +161,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock)
15373         arch_spin_unlock(&lock->raw_lock);
15374  }
15375  
15376 +#ifndef CONFIG_PREEMPT_RT_FULL
15377  static void rwlock_bug(rwlock_t *lock, const char *msg)
15378  {
15379         if (!debug_locks_off())
15380 @@ -300,3 +303,5 @@ void do_raw_write_unlock(rwlock_t *lock)
15381         debug_write_unlock(lock);
15382         arch_write_unlock(&lock->raw_lock);
15383  }
15384 +
15385 +#endif
15386 diff --git a/kernel/panic.c b/kernel/panic.c
15387 index ca8cea1ef673..6b698115f003 100644
15388 --- a/kernel/panic.c
15389 +++ b/kernel/panic.c
15390 @@ -449,9 +449,11 @@ static u64 oops_id;
15391  
15392  static int init_oops_id(void)
15393  {
15394 +#ifndef CONFIG_PREEMPT_RT_FULL
15395         if (!oops_id)
15396                 get_random_bytes(&oops_id, sizeof(oops_id));
15397         else
15398 +#endif
15399                 oops_id++;
15400  
15401         return 0;
15402 diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
15403 index 33c79b6105c5..f53375bc77df 100644
15404 --- a/kernel/power/hibernate.c
15405 +++ b/kernel/power/hibernate.c
15406 @@ -286,6 +286,8 @@ static int create_image(int platform_mode)
15407  
15408         local_irq_disable();
15409  
15410 +       system_state = SYSTEM_SUSPEND;
15411 +
15412         error = syscore_suspend();
15413         if (error) {
15414                 printk(KERN_ERR "PM: Some system devices failed to power down, "
15415 @@ -315,6 +317,7 @@ static int create_image(int platform_mode)
15416         syscore_resume();
15417  
15418   Enable_irqs:
15419 +       system_state = SYSTEM_RUNNING;
15420         local_irq_enable();
15421  
15422   Enable_cpus:
15423 @@ -444,6 +447,7 @@ static int resume_target_kernel(bool platform_mode)
15424                 goto Enable_cpus;
15425  
15426         local_irq_disable();
15427 +       system_state = SYSTEM_SUSPEND;
15428  
15429         error = syscore_suspend();
15430         if (error)
15431 @@ -477,6 +481,7 @@ static int resume_target_kernel(bool platform_mode)
15432         syscore_resume();
15433  
15434   Enable_irqs:
15435 +       system_state = SYSTEM_RUNNING;
15436         local_irq_enable();
15437  
15438   Enable_cpus:
15439 @@ -562,6 +567,7 @@ int hibernation_platform_enter(void)
15440                 goto Enable_cpus;
15441  
15442         local_irq_disable();
15443 +       system_state = SYSTEM_SUSPEND;
15444         syscore_suspend();
15445         if (pm_wakeup_pending()) {
15446                 error = -EAGAIN;
15447 @@ -574,6 +580,7 @@ int hibernation_platform_enter(void)
15448  
15449   Power_up:
15450         syscore_resume();
15451 +       system_state = SYSTEM_RUNNING;
15452         local_irq_enable();
15453  
15454   Enable_cpus:
15455 @@ -674,6 +681,10 @@ static int load_image_and_restore(void)
15456         return error;
15457  }
15458  
15459 +#ifndef CONFIG_SUSPEND
15460 +bool pm_in_action;
15461 +#endif
15462 +
15463  /**
15464   * hibernate - Carry out system hibernation, including saving the image.
15465   */
15466 @@ -687,6 +698,8 @@ int hibernate(void)
15467                 return -EPERM;
15468         }
15469  
15470 +       pm_in_action = true;
15471 +
15472         lock_system_sleep();
15473         /* The snapshot device should not be opened while we're running */
15474         if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
15475 @@ -764,6 +777,7 @@ int hibernate(void)
15476         atomic_inc(&snapshot_device_available);
15477   Unlock:
15478         unlock_system_sleep();
15479 +       pm_in_action = false;
15480         return error;
15481  }
15482  
15483 diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
15484 index 0acab9d7f96f..aac06aad757c 100644
15485 --- a/kernel/power/suspend.c
15486 +++ b/kernel/power/suspend.c
15487 @@ -361,6 +361,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
15488         arch_suspend_disable_irqs();
15489         BUG_ON(!irqs_disabled());
15490  
15491 +       system_state = SYSTEM_SUSPEND;
15492 +
15493         error = syscore_suspend();
15494         if (!error) {
15495                 *wakeup = pm_wakeup_pending();
15496 @@ -377,6 +379,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
15497                 syscore_resume();
15498         }
15499  
15500 +       system_state = SYSTEM_RUNNING;
15501 +
15502         arch_suspend_enable_irqs();
15503         BUG_ON(irqs_disabled());
15504  
15505 @@ -519,6 +523,8 @@ static int enter_state(suspend_state_t state)
15506         return error;
15507  }
15508  
15509 +bool pm_in_action;
15510 +
15511  /**
15512   * pm_suspend - Externally visible function for suspending the system.
15513   * @state: System sleep state to enter.
15514 @@ -533,6 +539,8 @@ int pm_suspend(suspend_state_t state)
15515         if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
15516                 return -EINVAL;
15517  
15518 +       pm_in_action = true;
15519 +
15520         error = enter_state(state);
15521         if (error) {
15522                 suspend_stats.fail++;
15523 @@ -540,6 +548,7 @@ int pm_suspend(suspend_state_t state)
15524         } else {
15525                 suspend_stats.success++;
15526         }
15527 +       pm_in_action = false;
15528         return error;
15529  }
15530  EXPORT_SYMBOL(pm_suspend);
15531 diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
15532 index eea6dbc2d8cf..6f01c7ecb45e 100644
15533 --- a/kernel/printk/printk.c
15534 +++ b/kernel/printk/printk.c
15535 @@ -351,6 +351,65 @@ __packed __aligned(4)
15536   */
15537  DEFINE_RAW_SPINLOCK(logbuf_lock);
15538  
15539 +#ifdef CONFIG_EARLY_PRINTK
15540 +struct console *early_console;
15541 +
15542 +static void early_vprintk(const char *fmt, va_list ap)
15543 +{
15544 +       if (early_console) {
15545 +               char buf[512];
15546 +               int n = vscnprintf(buf, sizeof(buf), fmt, ap);
15547 +
15548 +               early_console->write(early_console, buf, n);
15549 +       }
15550 +}
15551 +
15552 +asmlinkage void early_printk(const char *fmt, ...)
15553 +{
15554 +       va_list ap;
15555 +
15556 +       va_start(ap, fmt);
15557 +       early_vprintk(fmt, ap);
15558 +       va_end(ap);
15559 +}
15560 +
15561 +/*
15562 + * This is independent of any log levels - a global
15563 + * kill switch that turns off all of printk.
15564 + *
15565 + * Used by the NMI watchdog if early-printk is enabled.
15566 + */
15567 +static bool __read_mostly printk_killswitch;
15568 +
15569 +static int __init force_early_printk_setup(char *str)
15570 +{
15571 +       printk_killswitch = true;
15572 +       return 0;
15573 +}
15574 +early_param("force_early_printk", force_early_printk_setup);
15575 +
15576 +void printk_kill(void)
15577 +{
15578 +       printk_killswitch = true;
15579 +}
15580 +
15581 +#ifdef CONFIG_PRINTK
15582 +static int forced_early_printk(const char *fmt, va_list ap)
15583 +{
15584 +       if (!printk_killswitch)
15585 +               return 0;
15586 +       early_vprintk(fmt, ap);
15587 +       return 1;
15588 +}
15589 +#endif
15590 +
15591 +#else
15592 +static inline int forced_early_printk(const char *fmt, va_list ap)
15593 +{
15594 +       return 0;
15595 +}
15596 +#endif
15597 +
15598  #ifdef CONFIG_PRINTK
15599  DECLARE_WAIT_QUEUE_HEAD(log_wait);
15600  /* the next printk record to read by syslog(READ) or /proc/kmsg */
15601 @@ -1340,6 +1399,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15602  {
15603         char *text;
15604         int len = 0;
15605 +       int attempts = 0;
15606  
15607         text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
15608         if (!text)
15609 @@ -1351,6 +1411,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15610                 u64 seq;
15611                 u32 idx;
15612                 enum log_flags prev;
15613 +               int num_msg;
15614 +try_again:
15615 +               attempts++;
15616 +               if (attempts > 10) {
15617 +                       len = -EBUSY;
15618 +                       goto out;
15619 +               }
15620 +               num_msg = 0;
15621  
15622                 /*
15623                  * Find first record that fits, including all following records,
15624 @@ -1366,6 +1434,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15625                         prev = msg->flags;
15626                         idx = log_next(idx);
15627                         seq++;
15628 +                       num_msg++;
15629 +                       if (num_msg > 5) {
15630 +                               num_msg = 0;
15631 +                               raw_spin_unlock_irq(&logbuf_lock);
15632 +                               raw_spin_lock_irq(&logbuf_lock);
15633 +                               if (clear_seq < log_first_seq)
15634 +                                       goto try_again;
15635 +                       }
15636                 }
15637  
15638                 /* move first record forward until length fits into the buffer */
15639 @@ -1379,6 +1455,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15640                         prev = msg->flags;
15641                         idx = log_next(idx);
15642                         seq++;
15643 +                       num_msg++;
15644 +                       if (num_msg > 5) {
15645 +                               num_msg = 0;
15646 +                               raw_spin_unlock_irq(&logbuf_lock);
15647 +                               raw_spin_lock_irq(&logbuf_lock);
15648 +                               if (clear_seq < log_first_seq)
15649 +                                       goto try_again;
15650 +                       }
15651                 }
15652  
15653                 /* last message fitting into this dump */
15654 @@ -1419,6 +1503,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
15655                 clear_seq = log_next_seq;
15656                 clear_idx = log_next_idx;
15657         }
15658 +out:
15659         raw_spin_unlock_irq(&logbuf_lock);
15660  
15661         kfree(text);
15662 @@ -1572,6 +1657,12 @@ static void call_console_drivers(int level,
15663         if (!console_drivers)
15664                 return;
15665  
15666 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
15667 +               if (in_irq() || in_nmi())
15668 +                       return;
15669 +       }
15670 +
15671 +       migrate_disable();
15672         for_each_console(con) {
15673                 if (exclusive_console && con != exclusive_console)
15674                         continue;
15675 @@ -1587,6 +1678,7 @@ static void call_console_drivers(int level,
15676                 else
15677                         con->write(con, text, len);
15678         }
15679 +       migrate_enable();
15680  }
15681  
15682  /*
15683 @@ -1750,6 +1842,13 @@ asmlinkage int vprintk_emit(int facility, int level,
15684         /* cpu currently holding logbuf_lock in this function */
15685         static unsigned int logbuf_cpu = UINT_MAX;
15686  
15687 +       /*
15688 +        * Fall back to early_printk if a debugging subsystem has
15689 +        * killed printk output
15690 +        */
15691 +       if (unlikely(forced_early_printk(fmt, args)))
15692 +               return 1;
15693 +
15694         if (level == LOGLEVEL_SCHED) {
15695                 level = LOGLEVEL_DEFAULT;
15696                 in_sched = true;
15697 @@ -1894,13 +1993,23 @@ asmlinkage int vprintk_emit(int facility, int level,
15698  
15699         /* If called from the scheduler, we can not call up(). */
15700         if (!in_sched) {
15701 +               int may_trylock = 1;
15702 +
15703                 lockdep_off();
15704 +#ifdef CONFIG_PREEMPT_RT_FULL
15705 +               /*
15706 +                * We can't take a sleeping lock with IRQs or preemption disabled,
15707 +                * so we can't print in these contexts.
15708 +                */
15709 +               if (!(preempt_count() == 0 && !irqs_disabled()))
15710 +                       may_trylock = 0;
15711 +#endif
15712                 /*
15713                  * Try to acquire and then immediately release the console
15714                  * semaphore.  The release will print out buffers and wake up
15715                  * /dev/kmsg and syslog() users.
15716                  */
15717 -               if (console_trylock())
15718 +               if (may_trylock && console_trylock())
15719                         console_unlock();
15720                 lockdep_on();
15721         }
15722 @@ -2023,26 +2132,6 @@ DEFINE_PER_CPU(printk_func_t, printk_func);
15723  
15724  #endif /* CONFIG_PRINTK */
15725  
15726 -#ifdef CONFIG_EARLY_PRINTK
15727 -struct console *early_console;
15728 -
15729 -asmlinkage __visible void early_printk(const char *fmt, ...)
15730 -{
15731 -       va_list ap;
15732 -       char buf[512];
15733 -       int n;
15734 -
15735 -       if (!early_console)
15736 -               return;
15737 -
15738 -       va_start(ap, fmt);
15739 -       n = vscnprintf(buf, sizeof(buf), fmt, ap);
15740 -       va_end(ap);
15741 -
15742 -       early_console->write(early_console, buf, n);
15743 -}
15744 -#endif
15745 -
15746  static int __add_preferred_console(char *name, int idx, char *options,
15747                                    char *brl_options)
15748  {
15749 @@ -2312,11 +2401,16 @@ static void console_cont_flush(char *text, size_t size)
15750                 goto out;
15751  
15752         len = cont_print_text(text, size);
15753 +#ifdef CONFIG_PREEMPT_RT_FULL
15754 +       raw_spin_unlock_irqrestore(&logbuf_lock, flags);
15755 +       call_console_drivers(cont.level, NULL, 0, text, len);
15756 +#else
15757         raw_spin_unlock(&logbuf_lock);
15758         stop_critical_timings();
15759         call_console_drivers(cont.level, NULL, 0, text, len);
15760         start_critical_timings();
15761         local_irq_restore(flags);
15762 +#endif
15763         return;
15764  out:
15765         raw_spin_unlock_irqrestore(&logbuf_lock, flags);
15766 @@ -2440,13 +2534,17 @@ void console_unlock(void)
15767                 console_idx = log_next(console_idx);
15768                 console_seq++;
15769                 console_prev = msg->flags;
15770 +#ifdef CONFIG_PREEMPT_RT_FULL
15771 +               raw_spin_unlock_irqrestore(&logbuf_lock, flags);
15772 +               call_console_drivers(level, ext_text, ext_len, text, len);
15773 +#else
15774                 raw_spin_unlock(&logbuf_lock);
15775  
15776                 stop_critical_timings();        /* don't trace print latency */
15777                 call_console_drivers(level, ext_text, ext_len, text, len);
15778                 start_critical_timings();
15779                 local_irq_restore(flags);
15780 -
15781 +#endif
15782                 if (do_cond_resched)
15783                         cond_resched();
15784         }
15785 @@ -2498,6 +2596,11 @@ void console_unblank(void)
15786  {
15787         struct console *c;
15788  
15789 +       if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) {
15790 +               if (in_irq() || in_nmi())
15791 +                       return;
15792 +       }
15793 +
15794         /*
15795          * console_unblank can no longer be called in interrupt context unless
15796          * oops_in_progress is set to 1..
15797 diff --git a/kernel/ptrace.c b/kernel/ptrace.c
15798 index 1d3b7665d0be..ce666639789d 100644
15799 --- a/kernel/ptrace.c
15800 +++ b/kernel/ptrace.c
15801 @@ -128,7 +128,14 @@ static bool ptrace_freeze_traced(struct task_struct *task)
15802  
15803         spin_lock_irq(&task->sighand->siglock);
15804         if (task_is_traced(task) && !__fatal_signal_pending(task)) {
15805 -               task->state = __TASK_TRACED;
15806 +               unsigned long flags;
15807 +
15808 +               raw_spin_lock_irqsave(&task->pi_lock, flags);
15809 +               if (task->state & __TASK_TRACED)
15810 +                       task->state = __TASK_TRACED;
15811 +               else
15812 +                       task->saved_state = __TASK_TRACED;
15813 +               raw_spin_unlock_irqrestore(&task->pi_lock, flags);
15814                 ret = true;
15815         }
15816         spin_unlock_irq(&task->sighand->siglock);
15817 diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
15818 index 971e2b138063..a304670fb917 100644
15819 --- a/kernel/rcu/rcutorture.c
15820 +++ b/kernel/rcu/rcutorture.c
15821 @@ -404,6 +404,7 @@ static struct rcu_torture_ops rcu_ops = {
15822         .name           = "rcu"
15823  };
15824  
15825 +#ifndef CONFIG_PREEMPT_RT_FULL
15826  /*
15827   * Definitions for rcu_bh torture testing.
15828   */
15829 @@ -443,6 +444,12 @@ static struct rcu_torture_ops rcu_bh_ops = {
15830         .name           = "rcu_bh"
15831  };
15832  
15833 +#else
15834 +static struct rcu_torture_ops rcu_bh_ops = {
15835 +       .ttype          = INVALID_RCU_FLAVOR,
15836 +};
15837 +#endif
15838 +
15839  /*
15840   * Don't even think about trying any of these in real life!!!
15841   * The names include "busted", and they really mean it!
15842 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
15843 index 5d80925e7fc8..2b4bc2b2c25a 100644
15844 --- a/kernel/rcu/tree.c
15845 +++ b/kernel/rcu/tree.c
15846 @@ -56,6 +56,11 @@
15847  #include <linux/random.h>
15848  #include <linux/trace_events.h>
15849  #include <linux/suspend.h>
15850 +#include <linux/delay.h>
15851 +#include <linux/gfp.h>
15852 +#include <linux/oom.h>
15853 +#include <linux/smpboot.h>
15854 +#include "../time/tick-internal.h"
15855  
15856  #include "tree.h"
15857  #include "rcu.h"
15858 @@ -259,6 +264,19 @@ void rcu_sched_qs(void)
15859                            this_cpu_ptr(&rcu_sched_data), true);
15860  }
15861  
15862 +#ifdef CONFIG_PREEMPT_RT_FULL
15863 +static void rcu_preempt_qs(void);
15864 +
15865 +void rcu_bh_qs(void)
15866 +{
15867 +       unsigned long flags;
15868 +
15869 +       /* Callers to this function, rcu_preempt_qs(), must disable irqs. */
15870 +       local_irq_save(flags);
15871 +       rcu_preempt_qs();
15872 +       local_irq_restore(flags);
15873 +}
15874 +#else
15875  void rcu_bh_qs(void)
15876  {
15877         if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
15878 @@ -268,6 +286,7 @@ void rcu_bh_qs(void)
15879                 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
15880         }
15881  }
15882 +#endif
15883  
15884  static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
15885  
15886 @@ -448,11 +467,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
15887  /*
15888   * Return the number of RCU BH batches started thus far for debug & stats.
15889   */
15890 +#ifndef CONFIG_PREEMPT_RT_FULL
15891  unsigned long rcu_batches_started_bh(void)
15892  {
15893         return rcu_bh_state.gpnum;
15894  }
15895  EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
15896 +#endif
15897  
15898  /*
15899   * Return the number of RCU batches completed thus far for debug & stats.
15900 @@ -472,6 +493,7 @@ unsigned long rcu_batches_completed_sched(void)
15901  }
15902  EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
15903  
15904 +#ifndef CONFIG_PREEMPT_RT_FULL
15905  /*
15906   * Return the number of RCU BH batches completed thus far for debug & stats.
15907   */
15908 @@ -480,6 +502,7 @@ unsigned long rcu_batches_completed_bh(void)
15909         return rcu_bh_state.completed;
15910  }
15911  EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
15912 +#endif
15913  
15914  /*
15915   * Return the number of RCU expedited batches completed thus far for
15916 @@ -503,6 +526,7 @@ unsigned long rcu_exp_batches_completed_sched(void)
15917  }
15918  EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
15919  
15920 +#ifndef CONFIG_PREEMPT_RT_FULL
15921  /*
15922   * Force a quiescent state.
15923   */
15924 @@ -521,6 +545,13 @@ void rcu_bh_force_quiescent_state(void)
15925  }
15926  EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
15927  
15928 +#else
15929 +void rcu_force_quiescent_state(void)
15930 +{
15931 +}
15932 +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
15933 +#endif
15934 +
15935  /*
15936   * Force a quiescent state for RCU-sched.
15937   */
15938 @@ -571,9 +602,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
15939         case RCU_FLAVOR:
15940                 rsp = rcu_state_p;
15941                 break;
15942 +#ifndef CONFIG_PREEMPT_RT_FULL
15943         case RCU_BH_FLAVOR:
15944                 rsp = &rcu_bh_state;
15945                 break;
15946 +#endif
15947         case RCU_SCHED_FLAVOR:
15948                 rsp = &rcu_sched_state;
15949                 break;
15950 @@ -3013,18 +3046,17 @@ __rcu_process_callbacks(struct rcu_state *rsp)
15951  /*
15952   * Do RCU core processing for the current CPU.
15953   */
15954 -static void rcu_process_callbacks(struct softirq_action *unused)
15955 +static void rcu_process_callbacks(void)
15956  {
15957         struct rcu_state *rsp;
15958  
15959         if (cpu_is_offline(smp_processor_id()))
15960                 return;
15961 -       trace_rcu_utilization(TPS("Start RCU core"));
15962         for_each_rcu_flavor(rsp)
15963                 __rcu_process_callbacks(rsp);
15964 -       trace_rcu_utilization(TPS("End RCU core"));
15965  }
15966  
15967 +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
15968  /*
15969   * Schedule RCU callback invocation.  If the specified type of RCU
15970   * does not support RCU priority boosting, just do a direct call,
15971 @@ -3036,19 +3068,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
15972  {
15973         if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
15974                 return;
15975 -       if (likely(!rsp->boost)) {
15976 -               rcu_do_batch(rsp, rdp);
15977 -               return;
15978 -       }
15979 -       invoke_rcu_callbacks_kthread();
15980 +       rcu_do_batch(rsp, rdp);
15981  }
15982  
15983 +static void rcu_wake_cond(struct task_struct *t, int status)
15984 +{
15985 +       /*
15986 +        * If the thread is yielding, only wake it when this
15987 +        * is invoked from idle
15988 +        */
15989 +       if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
15990 +               wake_up_process(t);
15991 +}
15992 +
15993 +/*
15994 + * Wake up this CPU's rcuc kthread to do RCU core processing.
15995 + */
15996  static void invoke_rcu_core(void)
15997  {
15998 -       if (cpu_online(smp_processor_id()))
15999 -               raise_softirq(RCU_SOFTIRQ);
16000 +       unsigned long flags;
16001 +       struct task_struct *t;
16002 +
16003 +       if (!cpu_online(smp_processor_id()))
16004 +               return;
16005 +       local_irq_save(flags);
16006 +       __this_cpu_write(rcu_cpu_has_work, 1);
16007 +       t = __this_cpu_read(rcu_cpu_kthread_task);
16008 +       if (t != NULL && current != t)
16009 +               rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status));
16010 +       local_irq_restore(flags);
16011  }
16012  
16013 +static void rcu_cpu_kthread_park(unsigned int cpu)
16014 +{
16015 +       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16016 +}
16017 +
16018 +static int rcu_cpu_kthread_should_run(unsigned int cpu)
16019 +{
16020 +       return __this_cpu_read(rcu_cpu_has_work);
16021 +}
16022 +
16023 +/*
16024 + * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16025 + * RCU softirq used in flavors and configurations of RCU that do not
16026 + * support RCU priority boosting.
16027 + */
16028 +static void rcu_cpu_kthread(unsigned int cpu)
16029 +{
16030 +       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16031 +       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16032 +       int spincnt;
16033 +
16034 +       for (spincnt = 0; spincnt < 10; spincnt++) {
16035 +               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16036 +               local_bh_disable();
16037 +               *statusp = RCU_KTHREAD_RUNNING;
16038 +               this_cpu_inc(rcu_cpu_kthread_loops);
16039 +               local_irq_disable();
16040 +               work = *workp;
16041 +               *workp = 0;
16042 +               local_irq_enable();
16043 +               if (work)
16044 +                       rcu_process_callbacks();
16045 +               local_bh_enable();
16046 +               if (*workp == 0) {
16047 +                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16048 +                       *statusp = RCU_KTHREAD_WAITING;
16049 +                       return;
16050 +               }
16051 +       }
16052 +       *statusp = RCU_KTHREAD_YIELDING;
16053 +       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16054 +       schedule_timeout_interruptible(2);
16055 +       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16056 +       *statusp = RCU_KTHREAD_WAITING;
16057 +}
16058 +
16059 +static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16060 +       .store                  = &rcu_cpu_kthread_task,
16061 +       .thread_should_run      = rcu_cpu_kthread_should_run,
16062 +       .thread_fn              = rcu_cpu_kthread,
16063 +       .thread_comm            = "rcuc/%u",
16064 +       .setup                  = rcu_cpu_kthread_setup,
16065 +       .park                   = rcu_cpu_kthread_park,
16066 +};
16067 +
16068 +/*
16069 + * Spawn per-CPU RCU core processing kthreads.
16070 + */
16071 +static int __init rcu_spawn_core_kthreads(void)
16072 +{
16073 +       int cpu;
16074 +
16075 +       for_each_possible_cpu(cpu)
16076 +               per_cpu(rcu_cpu_has_work, cpu) = 0;
16077 +       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16078 +       return 0;
16079 +}
16080 +early_initcall(rcu_spawn_core_kthreads);
16081 +
16082  /*
16083   * Handle any core-RCU processing required by a call_rcu() invocation.
16084   */
16085 @@ -3192,6 +3311,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
16086  }
16087  EXPORT_SYMBOL_GPL(call_rcu_sched);
16088  
16089 +#ifndef CONFIG_PREEMPT_RT_FULL
16090  /*
16091   * Queue an RCU callback for invocation after a quicker grace period.
16092   */
16093 @@ -3200,6 +3320,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
16094         __call_rcu(head, func, &rcu_bh_state, -1, 0);
16095  }
16096  EXPORT_SYMBOL_GPL(call_rcu_bh);
16097 +#endif
16098  
16099  /*
16100   * Queue an RCU callback for lazy invocation after a grace period.
16101 @@ -3291,6 +3412,7 @@ void synchronize_sched(void)
16102  }
16103  EXPORT_SYMBOL_GPL(synchronize_sched);
16104  
16105 +#ifndef CONFIG_PREEMPT_RT_FULL
16106  /**
16107   * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
16108   *
16109 @@ -3317,6 +3439,7 @@ void synchronize_rcu_bh(void)
16110                 wait_rcu_gp(call_rcu_bh);
16111  }
16112  EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
16113 +#endif
16114  
16115  /**
16116   * get_state_synchronize_rcu - Snapshot current RCU state
16117 @@ -3695,6 +3818,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
16118         mutex_unlock(&rsp->barrier_mutex);
16119  }
16120  
16121 +#ifndef CONFIG_PREEMPT_RT_FULL
16122  /**
16123   * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
16124   */
16125 @@ -3703,6 +3827,7 @@ void rcu_barrier_bh(void)
16126         _rcu_barrier(&rcu_bh_state);
16127  }
16128  EXPORT_SYMBOL_GPL(rcu_barrier_bh);
16129 +#endif
16130  
16131  /**
16132   * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
16133 @@ -4196,12 +4321,13 @@ void __init rcu_init(void)
16134  
16135         rcu_bootup_announce();
16136         rcu_init_geometry();
16137 +#ifndef CONFIG_PREEMPT_RT_FULL
16138         rcu_init_one(&rcu_bh_state);
16139 +#endif
16140         rcu_init_one(&rcu_sched_state);
16141         if (dump_tree)
16142                 rcu_dump_rcu_node_tree(&rcu_sched_state);
16143         __rcu_init_preempt();
16144 -       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
16145  
16146         /*
16147          * We don't need protection against CPU-hotplug here because
16148 diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
16149 index f714f873bf9d..71631196e66e 100644
16150 --- a/kernel/rcu/tree.h
16151 +++ b/kernel/rcu/tree.h
16152 @@ -587,18 +587,18 @@ extern struct list_head rcu_struct_flavors;
16153   */
16154  extern struct rcu_state rcu_sched_state;
16155  
16156 +#ifndef CONFIG_PREEMPT_RT_FULL
16157  extern struct rcu_state rcu_bh_state;
16158 +#endif
16159  
16160  #ifdef CONFIG_PREEMPT_RCU
16161  extern struct rcu_state rcu_preempt_state;
16162  #endif /* #ifdef CONFIG_PREEMPT_RCU */
16163  
16164 -#ifdef CONFIG_RCU_BOOST
16165  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16166  DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
16167  DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16168  DECLARE_PER_CPU(char, rcu_cpu_has_work);
16169 -#endif /* #ifdef CONFIG_RCU_BOOST */
16170  
16171  #ifndef RCU_TREE_NONCORE
16172  
16173 @@ -618,10 +618,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
16174  static void __init __rcu_init_preempt(void);
16175  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
16176  static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
16177 -static void invoke_rcu_callbacks_kthread(void);
16178  static bool rcu_is_callbacks_kthread(void);
16179 +static void rcu_cpu_kthread_setup(unsigned int cpu);
16180  #ifdef CONFIG_RCU_BOOST
16181 -static void rcu_preempt_do_callbacks(void);
16182  static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
16183                                                  struct rcu_node *rnp);
16184  #endif /* #ifdef CONFIG_RCU_BOOST */
16185 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
16186 index 0082fce402a0..e08cddadd9c7 100644
16187 --- a/kernel/rcu/tree_plugin.h
16188 +++ b/kernel/rcu/tree_plugin.h
16189 @@ -24,25 +24,10 @@
16190   *        Paul E. McKenney <paulmck@linux.vnet.ibm.com>
16191   */
16192  
16193 -#include <linux/delay.h>
16194 -#include <linux/gfp.h>
16195 -#include <linux/oom.h>
16196 -#include <linux/smpboot.h>
16197 -#include "../time/tick-internal.h"
16198 -
16199  #ifdef CONFIG_RCU_BOOST
16200  
16201  #include "../locking/rtmutex_common.h"
16202  
16203 -/*
16204 - * Control variables for per-CPU and per-rcu_node kthreads.  These
16205 - * handle all flavors of RCU.
16206 - */
16207 -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
16208 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16209 -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16210 -DEFINE_PER_CPU(char, rcu_cpu_has_work);
16211 -
16212  #else /* #ifdef CONFIG_RCU_BOOST */
16213  
16214  /*
16215 @@ -55,6 +40,14 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
16216  
16217  #endif /* #else #ifdef CONFIG_RCU_BOOST */
16218  
16219 +/*
16220 + * Control variables for per-CPU and per-rcu_node kthreads.  These
16221 + * handle all flavors of RCU.
16222 + */
16223 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
16224 +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
16225 +DEFINE_PER_CPU(char, rcu_cpu_has_work);
16226 +
16227  #ifdef CONFIG_RCU_NOCB_CPU
16228  static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
16229  static bool have_rcu_nocb_mask;            /* Was rcu_nocb_mask allocated? */
16230 @@ -426,7 +419,7 @@ void rcu_read_unlock_special(struct task_struct *t)
16231         }
16232  
16233         /* Hardware IRQ handlers cannot block, complain if they get here. */
16234 -       if (in_irq() || in_serving_softirq()) {
16235 +       if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) {
16236                 lockdep_rcu_suspicious(__FILE__, __LINE__,
16237                                        "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
16238                 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
16239 @@ -632,15 +625,6 @@ static void rcu_preempt_check_callbacks(void)
16240                 t->rcu_read_unlock_special.b.need_qs = true;
16241  }
16242  
16243 -#ifdef CONFIG_RCU_BOOST
16244 -
16245 -static void rcu_preempt_do_callbacks(void)
16246 -{
16247 -       rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p));
16248 -}
16249 -
16250 -#endif /* #ifdef CONFIG_RCU_BOOST */
16251 -
16252  /*
16253   * Queue a preemptible-RCU callback for invocation after a grace period.
16254   */
16255 @@ -829,6 +813,19 @@ void exit_rcu(void)
16256  
16257  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
16258  
16259 +/*
16260 + * If boosting, set rcuc kthreads to realtime priority.
16261 + */
16262 +static void rcu_cpu_kthread_setup(unsigned int cpu)
16263 +{
16264 +#ifdef CONFIG_RCU_BOOST
16265 +       struct sched_param sp;
16266 +
16267 +       sp.sched_priority = kthread_prio;
16268 +       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
16269 +#endif /* #ifdef CONFIG_RCU_BOOST */
16270 +}
16271 +
16272  #ifdef CONFIG_RCU_BOOST
16273  
16274  #include "../locking/rtmutex_common.h"
16275 @@ -860,16 +857,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
16276  
16277  #endif /* #else #ifdef CONFIG_RCU_TRACE */
16278  
16279 -static void rcu_wake_cond(struct task_struct *t, int status)
16280 -{
16281 -       /*
16282 -        * If the thread is yielding, only wake it when this
16283 -        * is invoked from idle
16284 -        */
16285 -       if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
16286 -               wake_up_process(t);
16287 -}
16288 -
16289  /*
16290   * Carry out RCU priority boosting on the task indicated by ->exp_tasks
16291   * or ->boost_tasks, advancing the pointer to the next task in the
16292 @@ -1013,23 +1000,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
16293  }
16294  
16295  /*
16296 - * Wake up the per-CPU kthread to invoke RCU callbacks.
16297 - */
16298 -static void invoke_rcu_callbacks_kthread(void)
16299 -{
16300 -       unsigned long flags;
16301 -
16302 -       local_irq_save(flags);
16303 -       __this_cpu_write(rcu_cpu_has_work, 1);
16304 -       if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
16305 -           current != __this_cpu_read(rcu_cpu_kthread_task)) {
16306 -               rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
16307 -                             __this_cpu_read(rcu_cpu_kthread_status));
16308 -       }
16309 -       local_irq_restore(flags);
16310 -}
16311 -
16312 -/*
16313   * Is the current CPU running the RCU-callbacks kthread?
16314   * Caller must have preemption disabled.
16315   */
16316 @@ -1083,67 +1053,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
16317         return 0;
16318  }
16319  
16320 -static void rcu_kthread_do_work(void)
16321 -{
16322 -       rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
16323 -       rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
16324 -       rcu_preempt_do_callbacks();
16325 -}
16326 -
16327 -static void rcu_cpu_kthread_setup(unsigned int cpu)
16328 -{
16329 -       struct sched_param sp;
16330 -
16331 -       sp.sched_priority = kthread_prio;
16332 -       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
16333 -}
16334 -
16335 -static void rcu_cpu_kthread_park(unsigned int cpu)
16336 -{
16337 -       per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
16338 -}
16339 -
16340 -static int rcu_cpu_kthread_should_run(unsigned int cpu)
16341 -{
16342 -       return __this_cpu_read(rcu_cpu_has_work);
16343 -}
16344 -
16345 -/*
16346 - * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
16347 - * RCU softirq used in flavors and configurations of RCU that do not
16348 - * support RCU priority boosting.
16349 - */
16350 -static void rcu_cpu_kthread(unsigned int cpu)
16351 -{
16352 -       unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
16353 -       char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
16354 -       int spincnt;
16355 -
16356 -       for (spincnt = 0; spincnt < 10; spincnt++) {
16357 -               trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
16358 -               local_bh_disable();
16359 -               *statusp = RCU_KTHREAD_RUNNING;
16360 -               this_cpu_inc(rcu_cpu_kthread_loops);
16361 -               local_irq_disable();
16362 -               work = *workp;
16363 -               *workp = 0;
16364 -               local_irq_enable();
16365 -               if (work)
16366 -                       rcu_kthread_do_work();
16367 -               local_bh_enable();
16368 -               if (*workp == 0) {
16369 -                       trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
16370 -                       *statusp = RCU_KTHREAD_WAITING;
16371 -                       return;
16372 -               }
16373 -       }
16374 -       *statusp = RCU_KTHREAD_YIELDING;
16375 -       trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
16376 -       schedule_timeout_interruptible(2);
16377 -       trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
16378 -       *statusp = RCU_KTHREAD_WAITING;
16379 -}
16380 -
16381  /*
16382   * Set the per-rcu_node kthread's affinity to cover all CPUs that are
16383   * served by the rcu_node in question.  The CPU hotplug lock is still
16384 @@ -1174,26 +1083,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
16385         free_cpumask_var(cm);
16386  }
16387  
16388 -static struct smp_hotplug_thread rcu_cpu_thread_spec = {
16389 -       .store                  = &rcu_cpu_kthread_task,
16390 -       .thread_should_run      = rcu_cpu_kthread_should_run,
16391 -       .thread_fn              = rcu_cpu_kthread,
16392 -       .thread_comm            = "rcuc/%u",
16393 -       .setup                  = rcu_cpu_kthread_setup,
16394 -       .park                   = rcu_cpu_kthread_park,
16395 -};
16396 -
16397  /*
16398   * Spawn boost kthreads -- called as soon as the scheduler is running.
16399   */
16400  static void __init rcu_spawn_boost_kthreads(void)
16401  {
16402         struct rcu_node *rnp;
16403 -       int cpu;
16404 -
16405 -       for_each_possible_cpu(cpu)
16406 -               per_cpu(rcu_cpu_has_work, cpu) = 0;
16407 -       BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
16408         rcu_for_each_leaf_node(rcu_state_p, rnp)
16409                 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
16410  }
16411 @@ -1216,11 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
16412         raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
16413  }
16414  
16415 -static void invoke_rcu_callbacks_kthread(void)
16416 -{
16417 -       WARN_ON_ONCE(1);
16418 -}
16419 -
16420  static bool rcu_is_callbacks_kthread(void)
16421  {
16422         return false;
16423 @@ -1244,7 +1134,7 @@ static void rcu_prepare_kthreads(int cpu)
16424  
16425  #endif /* #else #ifdef CONFIG_RCU_BOOST */
16426  
16427 -#if !defined(CONFIG_RCU_FAST_NO_HZ)
16428 +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL)
16429  
16430  /*
16431   * Check to see if any future RCU-related work will need to be done
16432 @@ -1261,7 +1151,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
16433         return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
16434                ? 0 : rcu_cpu_has_callbacks(NULL);
16435  }
16436 +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */
16437  
16438 +#if !defined(CONFIG_RCU_FAST_NO_HZ)
16439  /*
16440   * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
16441   * after it.
16442 @@ -1357,6 +1249,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
16443         return cbs_ready;
16444  }
16445  
16446 +#ifndef CONFIG_PREEMPT_RT_FULL
16447 +
16448  /*
16449   * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
16450   * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
16451 @@ -1402,6 +1296,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
16452         *nextevt = basemono + dj * TICK_NSEC;
16453         return 0;
16454  }
16455 +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */
16456  
16457  /*
16458   * Prepare a CPU for idle from an RCU perspective.  The first major task
16459 diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
16460 index f0d8322bc3ec..b40d3468ba4e 100644
16461 --- a/kernel/rcu/update.c
16462 +++ b/kernel/rcu/update.c
16463 @@ -295,6 +295,7 @@ int rcu_read_lock_held(void)
16464  }
16465  EXPORT_SYMBOL_GPL(rcu_read_lock_held);
16466  
16467 +#ifndef CONFIG_PREEMPT_RT_FULL
16468  /**
16469   * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
16470   *
16471 @@ -321,6 +322,7 @@ int rcu_read_lock_bh_held(void)
16472         return in_softirq() || irqs_disabled();
16473  }
16474  EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
16475 +#endif
16476  
16477  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
16478  
16479 diff --git a/kernel/relay.c b/kernel/relay.c
16480 index d797502140b9..cf05c17ddbed 100644
16481 --- a/kernel/relay.c
16482 +++ b/kernel/relay.c
16483 @@ -336,6 +336,10 @@ static void wakeup_readers(unsigned long data)
16484  {
16485         struct rchan_buf *buf = (struct rchan_buf *)data;
16486         wake_up_interruptible(&buf->read_wait);
16487 +       /*
16488 +        * Stupid polling for now:
16489 +        */
16490 +       mod_timer(&buf->timer, jiffies + 1);
16491  }
16492  
16493  /**
16494 @@ -353,6 +357,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
16495                 init_waitqueue_head(&buf->read_wait);
16496                 kref_init(&buf->kref);
16497                 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
16498 +               mod_timer(&buf->timer, jiffies + 1);
16499         } else
16500                 del_timer_sync(&buf->timer);
16501  
16502 @@ -767,15 +772,6 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
16503                 else
16504                         buf->early_bytes += buf->chan->subbuf_size -
16505                                             buf->padding[old_subbuf];
16506 -               smp_mb();
16507 -               if (waitqueue_active(&buf->read_wait))
16508 -                       /*
16509 -                        * Calling wake_up_interruptible() from here
16510 -                        * will deadlock if we happen to be logging
16511 -                        * from the scheduler (trying to re-grab
16512 -                        * rq->lock), so defer it.
16513 -                        */
16514 -                       mod_timer(&buf->timer, jiffies + 1);
16515         }
16516  
16517         old = buf->data;
16518 diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
16519 index 5e59b832ae2b..7337a7f60e3f 100644
16520 --- a/kernel/sched/Makefile
16521 +++ b/kernel/sched/Makefile
16522 @@ -17,7 +17,7 @@ endif
16523  
16524  obj-y += core.o loadavg.o clock.o cputime.o
16525  obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
16526 -obj-y += wait.o swait.o completion.o idle.o
16527 +obj-y += wait.o swait.o swork.o completion.o idle.o
16528  obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
16529  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
16530  obj-$(CONFIG_SCHEDSTATS) += stats.o
16531 diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
16532 index 8d0f35debf35..b62cf6400fe0 100644
16533 --- a/kernel/sched/completion.c
16534 +++ b/kernel/sched/completion.c
16535 @@ -30,10 +30,10 @@ void complete(struct completion *x)
16536  {
16537         unsigned long flags;
16538  
16539 -       spin_lock_irqsave(&x->wait.lock, flags);
16540 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16541         x->done++;
16542 -       __wake_up_locked(&x->wait, TASK_NORMAL, 1);
16543 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16544 +       swake_up_locked(&x->wait);
16545 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16546  }
16547  EXPORT_SYMBOL(complete);
16548  
16549 @@ -50,10 +50,10 @@ void complete_all(struct completion *x)
16550  {
16551         unsigned long flags;
16552  
16553 -       spin_lock_irqsave(&x->wait.lock, flags);
16554 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16555         x->done += UINT_MAX/2;
16556 -       __wake_up_locked(&x->wait, TASK_NORMAL, 0);
16557 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16558 +       swake_up_all_locked(&x->wait);
16559 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16560  }
16561  EXPORT_SYMBOL(complete_all);
16562  
16563 @@ -62,20 +62,20 @@ do_wait_for_common(struct completion *x,
16564                    long (*action)(long), long timeout, int state)
16565  {
16566         if (!x->done) {
16567 -               DECLARE_WAITQUEUE(wait, current);
16568 +               DECLARE_SWAITQUEUE(wait);
16569  
16570 -               __add_wait_queue_tail_exclusive(&x->wait, &wait);
16571 +               __prepare_to_swait(&x->wait, &wait);
16572                 do {
16573                         if (signal_pending_state(state, current)) {
16574                                 timeout = -ERESTARTSYS;
16575                                 break;
16576                         }
16577                         __set_current_state(state);
16578 -                       spin_unlock_irq(&x->wait.lock);
16579 +                       raw_spin_unlock_irq(&x->wait.lock);
16580                         timeout = action(timeout);
16581 -                       spin_lock_irq(&x->wait.lock);
16582 +                       raw_spin_lock_irq(&x->wait.lock);
16583                 } while (!x->done && timeout);
16584 -               __remove_wait_queue(&x->wait, &wait);
16585 +               __finish_swait(&x->wait, &wait);
16586                 if (!x->done)
16587                         return timeout;
16588         }
16589 @@ -89,9 +89,9 @@ __wait_for_common(struct completion *x,
16590  {
16591         might_sleep();
16592  
16593 -       spin_lock_irq(&x->wait.lock);
16594 +       raw_spin_lock_irq(&x->wait.lock);
16595         timeout = do_wait_for_common(x, action, timeout, state);
16596 -       spin_unlock_irq(&x->wait.lock);
16597 +       raw_spin_unlock_irq(&x->wait.lock);
16598         return timeout;
16599  }
16600  
16601 @@ -277,12 +277,12 @@ bool try_wait_for_completion(struct completion *x)
16602         if (!READ_ONCE(x->done))
16603                 return 0;
16604  
16605 -       spin_lock_irqsave(&x->wait.lock, flags);
16606 +       raw_spin_lock_irqsave(&x->wait.lock, flags);
16607         if (!x->done)
16608                 ret = 0;
16609         else
16610                 x->done--;
16611 -       spin_unlock_irqrestore(&x->wait.lock, flags);
16612 +       raw_spin_unlock_irqrestore(&x->wait.lock, flags);
16613         return ret;
16614  }
16615  EXPORT_SYMBOL(try_wait_for_completion);
16616 @@ -311,7 +311,7 @@ bool completion_done(struct completion *x)
16617          * after it's acquired the lock.
16618          */
16619         smp_rmb();
16620 -       spin_unlock_wait(&x->wait.lock);
16621 +       raw_spin_unlock_wait(&x->wait.lock);
16622         return true;
16623  }
16624  EXPORT_SYMBOL(completion_done);
16625 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
16626 index 44817c640e99..55aafcff5810 100644
16627 --- a/kernel/sched/core.c
16628 +++ b/kernel/sched/core.c
16629 @@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
16630   * Number of tasks to iterate in a single balance run.
16631   * Limited because this is done with IRQs disabled.
16632   */
16633 +#ifndef CONFIG_PREEMPT_RT_FULL
16634  const_debug unsigned int sysctl_sched_nr_migrate = 32;
16635 +#else
16636 +const_debug unsigned int sysctl_sched_nr_migrate = 8;
16637 +#endif
16638  
16639  /*
16640   * period over which we average the RT time consumption, measured
16641 @@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
16642  
16643         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
16644         rq->hrtick_timer.function = hrtick;
16645 +       rq->hrtick_timer.irqsafe = 1;
16646  }
16647  #else  /* CONFIG_SCHED_HRTICK */
16648  static inline void hrtick_clear(struct rq *rq)
16649 @@ -449,7 +454,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
16650         head->lastp = &node->next;
16651  }
16652  
16653 -void wake_up_q(struct wake_q_head *head)
16654 +void __wake_up_q(struct wake_q_head *head, bool sleeper)
16655  {
16656         struct wake_q_node *node = head->first;
16657  
16658 @@ -466,7 +471,10 @@ void wake_up_q(struct wake_q_head *head)
16659                  * wake_up_process() implies a wmb() to pair with the queueing
16660                  * in wake_q_add() so as not to miss wakeups.
16661                  */
16662 -               wake_up_process(task);
16663 +               if (sleeper)
16664 +                       wake_up_lock_sleeper(task);
16665 +               else
16666 +                       wake_up_process(task);
16667                 put_task_struct(task);
16668         }
16669  }
16670 @@ -502,6 +510,38 @@ void resched_curr(struct rq *rq)
16671                 trace_sched_wake_idle_without_ipi(cpu);
16672  }
16673  
16674 +#ifdef CONFIG_PREEMPT_LAZY
16675 +void resched_curr_lazy(struct rq *rq)
16676 +{
16677 +       struct task_struct *curr = rq->curr;
16678 +       int cpu;
16679 +
16680 +       if (!sched_feat(PREEMPT_LAZY)) {
16681 +               resched_curr(rq);
16682 +               return;
16683 +       }
16684 +
16685 +       lockdep_assert_held(&rq->lock);
16686 +
16687 +       if (test_tsk_need_resched(curr))
16688 +               return;
16689 +
16690 +       if (test_tsk_need_resched_lazy(curr))
16691 +               return;
16692 +
16693 +       set_tsk_need_resched_lazy(curr);
16694 +
16695 +       cpu = cpu_of(rq);
16696 +       if (cpu == smp_processor_id())
16697 +               return;
16698 +
16699 +       /* NEED_RESCHED_LAZY must be visible before we test polling */
16700 +       smp_mb();
16701 +       if (!tsk_is_polling(curr))
16702 +               smp_send_reschedule(cpu);
16703 +}
16704 +#endif
16705 +
16706  void resched_cpu(int cpu)
16707  {
16708         struct rq *rq = cpu_rq(cpu);
16709 @@ -525,11 +565,14 @@ void resched_cpu(int cpu)
16710   */
16711  int get_nohz_timer_target(void)
16712  {
16713 -       int i, cpu = smp_processor_id();
16714 +       int i, cpu;
16715         struct sched_domain *sd;
16716  
16717 +       preempt_disable_rt();
16718 +       cpu = smp_processor_id();
16719 +
16720         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
16721 -               return cpu;
16722 +               goto preempt_en_rt;
16723  
16724         rcu_read_lock();
16725         for_each_domain(cpu, sd) {
16726 @@ -548,6 +591,8 @@ int get_nohz_timer_target(void)
16727                 cpu = housekeeping_any_cpu();
16728  unlock:
16729         rcu_read_unlock();
16730 +preempt_en_rt:
16731 +       preempt_enable_rt();
16732         return cpu;
16733  }
16734  /*
16735 @@ -1089,6 +1134,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
16736  
16737         lockdep_assert_held(&p->pi_lock);
16738  
16739 +       if (__migrate_disabled(p)) {
16740 +               cpumask_copy(&p->cpus_allowed, new_mask);
16741 +               return;
16742 +       }
16743 +
16744         queued = task_on_rq_queued(p);
16745         running = task_current(rq, p);
16746  
16747 @@ -1111,6 +1161,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
16748                 enqueue_task(rq, p, ENQUEUE_RESTORE);
16749  }
16750  
16751 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
16752 +static DEFINE_MUTEX(sched_down_mutex);
16753 +static cpumask_t sched_down_cpumask;
16754 +
16755 +void tell_sched_cpu_down_begin(int cpu)
16756 +{
16757 +       mutex_lock(&sched_down_mutex);
16758 +       cpumask_set_cpu(cpu, &sched_down_cpumask);
16759 +       mutex_unlock(&sched_down_mutex);
16760 +}
16761 +
16762 +void tell_sched_cpu_down_done(int cpu)
16763 +{
16764 +       mutex_lock(&sched_down_mutex);
16765 +       cpumask_clear_cpu(cpu, &sched_down_cpumask);
16766 +       mutex_unlock(&sched_down_mutex);
16767 +}
16768 +
16769 +/**
16770 + * migrate_me - try to move the current task off this cpu
16771 + *
16772 + * Used by the pin_current_cpu() code to try to get tasks
16773 + * to move off the current CPU as it is going down.
16774 + * It will only move the task if the task isn't pinned to
16775 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
16776 + * and the task is in a RUNNING state. Otherwise the
16777 + * movement of the task will wake it up (change its state
16778 + * to running) when the task did not expect it.
16779 + *
16780 + * Returns 1 if it succeeded in moving the current task
16781 + *         0 otherwise.
16782 + */
16783 +int migrate_me(void)
16784 +{
16785 +       struct task_struct *p = current;
16786 +       struct migration_arg arg;
16787 +       struct cpumask *cpumask;
16788 +       struct cpumask *mask;
16789 +       unsigned int dest_cpu;
16790 +       struct rq_flags rf;
16791 +       struct rq *rq;
16792 +
16793 +       /*
16794 +        * We cannot migrate tasks bound to a CPU or tasks that are
16795 +        * not running. The movement of the task will wake it up.
16796 +        */
16797 +       if (p->flags & PF_NO_SETAFFINITY || p->state)
16798 +               return 0;
16799 +
16800 +       mutex_lock(&sched_down_mutex);
16801 +       rq = task_rq_lock(p, &rf);
16802 +
16803 +       cpumask = this_cpu_ptr(&sched_cpumasks);
16804 +       mask = &p->cpus_allowed;
16805 +
16806 +       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
16807 +
16808 +       if (!cpumask_weight(cpumask)) {
16809 +               /* It's only on this CPU? */
16810 +               task_rq_unlock(rq, p, &rf);
16811 +               mutex_unlock(&sched_down_mutex);
16812 +               return 0;
16813 +       }
16814 +
16815 +       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
16816 +
16817 +       arg.task = p;
16818 +       arg.dest_cpu = dest_cpu;
16819 +
16820 +       task_rq_unlock(rq, p, &rf);
16821 +
16822 +       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
16823 +       tlb_migrate_finish(p->mm);
16824 +       mutex_unlock(&sched_down_mutex);
16825 +
16826 +       return 1;
16827 +}
16828 +
16829  /*
16830   * Change a given task's CPU affinity. Migrate the thread to a
16831   * proper CPU and schedule it away if the CPU it's executing on
16832 @@ -1168,7 +1296,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
16833         }
16834  
16835         /* Can the task run on the task's current CPU? If so, we're done */
16836 -       if (cpumask_test_cpu(task_cpu(p), new_mask))
16837 +       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
16838                 goto out;
16839  
16840         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
16841 @@ -1355,6 +1483,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
16842         return ret;
16843  }
16844  
16845 +static bool check_task_state(struct task_struct *p, long match_state)
16846 +{
16847 +       bool match = false;
16848 +
16849 +       raw_spin_lock_irq(&p->pi_lock);
16850 +       if (p->state == match_state || p->saved_state == match_state)
16851 +               match = true;
16852 +       raw_spin_unlock_irq(&p->pi_lock);
16853 +
16854 +       return match;
16855 +}
16856 +
16857  /*
16858   * wait_task_inactive - wait for a thread to unschedule.
16859   *
16860 @@ -1399,7 +1539,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
16861                  * is actually now running somewhere else!
16862                  */
16863                 while (task_running(rq, p)) {
16864 -                       if (match_state && unlikely(p->state != match_state))
16865 +                       if (match_state && !check_task_state(p, match_state))
16866                                 return 0;
16867                         cpu_relax();
16868                 }
16869 @@ -1414,7 +1554,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
16870                 running = task_running(rq, p);
16871                 queued = task_on_rq_queued(p);
16872                 ncsw = 0;
16873 -               if (!match_state || p->state == match_state)
16874 +               if (!match_state || p->state == match_state ||
16875 +                   p->saved_state == match_state)
16876                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
16877                 task_rq_unlock(rq, p, &rf);
16878  
16879 @@ -1670,10 +1811,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
16880  {
16881         activate_task(rq, p, en_flags);
16882         p->on_rq = TASK_ON_RQ_QUEUED;
16883 -
16884 -       /* if a worker is waking up, notify workqueue */
16885 -       if (p->flags & PF_WQ_WORKER)
16886 -               wq_worker_waking_up(p, cpu_of(rq));
16887  }
16888  
16889  /*
16890 @@ -2008,8 +2145,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
16891          */
16892         smp_mb__before_spinlock();
16893         raw_spin_lock_irqsave(&p->pi_lock, flags);
16894 -       if (!(p->state & state))
16895 +       if (!(p->state & state)) {
16896 +               /*
16897 +                * The task might be running due to a spinlock sleeper
16898 +                * wakeup. Check the saved state and set it to running
16899 +                * if the wakeup condition is true.
16900 +                */
16901 +               if (!(wake_flags & WF_LOCK_SLEEPER)) {
16902 +                       if (p->saved_state & state) {
16903 +                               p->saved_state = TASK_RUNNING;
16904 +                               success = 1;
16905 +                       }
16906 +               }
16907                 goto out;
16908 +       }
16909 +
16910 +       /*
16911 +        * If this is a regular wakeup, then we can unconditionally
16912 +        * clear the saved state of a "lock sleeper".
16913 +        */
16914 +       if (!(wake_flags & WF_LOCK_SLEEPER))
16915 +               p->saved_state = TASK_RUNNING;
16916  
16917         trace_sched_waking(p);
16918  
16919 @@ -2093,53 +2249,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
16920  }
16921  
16922  /**
16923 - * try_to_wake_up_local - try to wake up a local task with rq lock held
16924 - * @p: the thread to be awakened
16925 - *
16926 - * Put @p on the run-queue if it's not already there. The caller must
16927 - * ensure that this_rq() is locked, @p is bound to this_rq() and not
16928 - * the current task.
16929 - */
16930 -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
16931 -{
16932 -       struct rq *rq = task_rq(p);
16933 -
16934 -       if (WARN_ON_ONCE(rq != this_rq()) ||
16935 -           WARN_ON_ONCE(p == current))
16936 -               return;
16937 -
16938 -       lockdep_assert_held(&rq->lock);
16939 -
16940 -       if (!raw_spin_trylock(&p->pi_lock)) {
16941 -               /*
16942 -                * This is OK, because current is on_cpu, which avoids it being
16943 -                * picked for load-balance and preemption/IRQs are still
16944 -                * disabled avoiding further scheduler activity on it and we've
16945 -                * not yet picked a replacement task.
16946 -                */
16947 -               lockdep_unpin_lock(&rq->lock, cookie);
16948 -               raw_spin_unlock(&rq->lock);
16949 -               raw_spin_lock(&p->pi_lock);
16950 -               raw_spin_lock(&rq->lock);
16951 -               lockdep_repin_lock(&rq->lock, cookie);
16952 -       }
16953 -
16954 -       if (!(p->state & TASK_NORMAL))
16955 -               goto out;
16956 -
16957 -       trace_sched_waking(p);
16958 -
16959 -       if (!task_on_rq_queued(p))
16960 -               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
16961 -
16962 -       ttwu_do_wakeup(rq, p, 0, cookie);
16963 -       if (schedstat_enabled())
16964 -               ttwu_stat(p, smp_processor_id(), 0);
16965 -out:
16966 -       raw_spin_unlock(&p->pi_lock);
16967 -}
16968 -
16969 -/**
16970   * wake_up_process - Wake up a specific process
16971   * @p: The process to be woken up.
16972   *
16973 @@ -2157,6 +2266,18 @@ int wake_up_process(struct task_struct *p)
16974  }
16975  EXPORT_SYMBOL(wake_up_process);
16976  
16977 +/**
16978 + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
16979 + * @p: The process to be woken up.
16980 + *
16981 + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
16982 + * the nature of the wakeup.
16983 + */
16984 +int wake_up_lock_sleeper(struct task_struct *p)
16985 +{
16986 +       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
16987 +}
16988 +
16989  int wake_up_state(struct task_struct *p, unsigned int state)
16990  {
16991         return try_to_wake_up(p, state, 0);
16992 @@ -2433,6 +2554,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
16993         p->on_cpu = 0;
16994  #endif
16995         init_task_preempt_count(p);
16996 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
16997 +       task_thread_info(p)->preempt_lazy_count = 0;
16998 +#endif
16999  #ifdef CONFIG_SMP
17000         plist_node_init(&p->pushable_tasks, MAX_PRIO);
17001         RB_CLEAR_NODE(&p->pushable_dl_tasks);
17002 @@ -2761,8 +2885,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
17003         finish_arch_post_lock_switch();
17004  
17005         fire_sched_in_preempt_notifiers(current);
17006 +       /*
17007 +        * We use mmdrop_delayed() here so we don't have to do the
17008 +        * full __mmdrop() when we are the last user.
17009 +        */
17010         if (mm)
17011 -               mmdrop(mm);
17012 +               mmdrop_delayed(mm);
17013         if (unlikely(prev_state == TASK_DEAD)) {
17014                 if (prev->sched_class->task_dead)
17015                         prev->sched_class->task_dead(prev);
17016 @@ -3237,6 +3365,77 @@ static inline void schedule_debug(struct task_struct *prev)
17017         schedstat_inc(this_rq(), sched_count);
17018  }
17019  
17020 +#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
17021 +
17022 +void migrate_disable(void)
17023 +{
17024 +       struct task_struct *p = current;
17025 +
17026 +       if (in_atomic() || irqs_disabled()) {
17027 +#ifdef CONFIG_SCHED_DEBUG
17028 +               p->migrate_disable_atomic++;
17029 +#endif
17030 +               return;
17031 +       }
17032 +
17033 +#ifdef CONFIG_SCHED_DEBUG
17034 +       if (unlikely(p->migrate_disable_atomic)) {
17035 +               tracing_off();
17036 +               WARN_ON_ONCE(1);
17037 +       }
17038 +#endif
17039 +
17040 +       if (p->migrate_disable) {
17041 +               p->migrate_disable++;
17042 +               return;
17043 +       }
17044 +
17045 +       preempt_disable();
17046 +       preempt_lazy_disable();
17047 +       pin_current_cpu();
17048 +       p->migrate_disable = 1;
17049 +       preempt_enable();
17050 +}
17051 +EXPORT_SYMBOL(migrate_disable);
17052 +
17053 +void migrate_enable(void)
17054 +{
17055 +       struct task_struct *p = current;
17056 +
17057 +       if (in_atomic() || irqs_disabled()) {
17058 +#ifdef CONFIG_SCHED_DEBUG
17059 +               p->migrate_disable_atomic--;
17060 +#endif
17061 +               return;
17062 +       }
17063 +
17064 +#ifdef CONFIG_SCHED_DEBUG
17065 +       if (unlikely(p->migrate_disable_atomic)) {
17066 +               tracing_off();
17067 +               WARN_ON_ONCE(1);
17068 +       }
17069 +#endif
17070 +       WARN_ON_ONCE(p->migrate_disable <= 0);
17071 +
17072 +       if (p->migrate_disable > 1) {
17073 +               p->migrate_disable--;
17074 +               return;
17075 +       }
17076 +
17077 +       preempt_disable();
17078 +       /*
17079 +        * Clearing migrate_disable causes tsk_cpus_allowed to
17080 +        * show the task's original cpu affinity.
17081 +        */
17082 +       p->migrate_disable = 0;
17083 +
17084 +       unpin_current_cpu();
17085 +       preempt_enable();
17086 +       preempt_lazy_enable();
17087 +}
17088 +EXPORT_SYMBOL(migrate_enable);
17089 +#endif
17090 +
17091  /*
17092   * Pick up the highest-prio task:
17093   */
17094 @@ -3364,19 +3563,6 @@ static void __sched notrace __schedule(bool preempt)
17095                 } else {
17096                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
17097                         prev->on_rq = 0;
17098 -
17099 -                       /*
17100 -                        * If a worker went to sleep, notify and ask workqueue
17101 -                        * whether it wants to wake up a task to maintain
17102 -                        * concurrency.
17103 -                        */
17104 -                       if (prev->flags & PF_WQ_WORKER) {
17105 -                               struct task_struct *to_wakeup;
17106 -
17107 -                               to_wakeup = wq_worker_sleeping(prev);
17108 -                               if (to_wakeup)
17109 -                                       try_to_wake_up_local(to_wakeup, cookie);
17110 -                       }
17111                 }
17112                 switch_count = &prev->nvcsw;
17113         }
17114 @@ -3386,6 +3572,7 @@ static void __sched notrace __schedule(bool preempt)
17115  
17116         next = pick_next_task(rq, prev, cookie);
17117         clear_tsk_need_resched(prev);
17118 +       clear_tsk_need_resched_lazy(prev);
17119         clear_preempt_need_resched();
17120         rq->clock_skip_update = 0;
17121  
17122 @@ -3407,9 +3594,20 @@ STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
17123  
17124  static inline void sched_submit_work(struct task_struct *tsk)
17125  {
17126 -       if (!tsk->state || tsk_is_pi_blocked(tsk))
17127 +       if (!tsk->state)
17128                 return;
17129         /*
17130 +        * If a worker went to sleep, notify and ask workqueue whether
17131 +        * it wants to wake up a task to maintain concurrency.
17132 +        */
17133 +       if (tsk->flags & PF_WQ_WORKER)
17134 +               wq_worker_sleeping(tsk);
17135 +
17136 +
17137 +       if (tsk_is_pi_blocked(tsk))
17138 +               return;
17139 +
17140 +       /*
17141          * If we are going to sleep and we have plugged IO queued,
17142          * make sure to submit it to avoid deadlocks.
17143          */
17144 @@ -3417,6 +3615,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
17145                 blk_schedule_flush_plug(tsk);
17146  }
17147  
17148 +static void sched_update_worker(struct task_struct *tsk)
17149 +{
17150 +       if (tsk->flags & PF_WQ_WORKER)
17151 +               wq_worker_running(tsk);
17152 +}
17153 +
17154  asmlinkage __visible void __sched schedule(void)
17155  {
17156         struct task_struct *tsk = current;
17157 @@ -3427,6 +3631,7 @@ asmlinkage __visible void __sched schedule(void)
17158                 __schedule(false);
17159                 sched_preempt_enable_no_resched();
17160         } while (need_resched());
17161 +       sched_update_worker(tsk);
17162  }
17163  EXPORT_SYMBOL(schedule);
17164  
17165 @@ -3490,6 +3695,30 @@ static void __sched notrace preempt_schedule_common(void)
17166         } while (need_resched());
17167  }
17168  
17169 +#ifdef CONFIG_PREEMPT_LAZY
17170 +/*
17171 + * If TIF_NEED_RESCHED is set then we allow the task to be scheduled away,
17172 + * since this is set by an RT task. Otherwise we try to avoid being scheduled
17173 + * out as long as the preempt_lazy_count counter is > 0.
17174 + */
17175 +static __always_inline int preemptible_lazy(void)
17176 +{
17177 +       if (test_thread_flag(TIF_NEED_RESCHED))
17178 +               return 1;
17179 +       if (current_thread_info()->preempt_lazy_count)
17180 +               return 0;
17181 +       return 1;
17182 +}
17183 +
17184 +#else
17185 +
17186 +static inline int preemptible_lazy(void)
17187 +{
17188 +       return 1;
17189 +}
17190 +
17191 +#endif
17192 +
17193  #ifdef CONFIG_PREEMPT
17194  /*
17195   * this is the entry point to schedule() from in-kernel preemption
17196 @@ -3504,7 +3733,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
17197          */
17198         if (likely(!preemptible()))
17199                 return;
17200 -
17201 +       if (!preemptible_lazy())
17202 +               return;
17203         preempt_schedule_common();
17204  }
17205  NOKPROBE_SYMBOL(preempt_schedule);
17206 @@ -3531,6 +3761,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
17207         if (likely(!preemptible()))
17208                 return;
17209  
17210 +       if (!preemptible_lazy())
17211 +               return;
17212 +
17213         do {
17214                 /*
17215                  * Because the function tracer can trace preempt_count_sub()
17216 @@ -3553,7 +3786,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
17217                  * an infinite recursion.
17218                  */
17219                 prev_ctx = exception_enter();
17220 +               /*
17221 +                * The add/subtract must not be traced by the function
17222 +                * tracer. But we still want to account for the
17223 +                * preempt off latency tracer. Since the _notrace versions
17224 +                * of add/subtract skip the accounting for the latency tracer,
17225 +                * we must force it manually.
17226 +                */
17227 +               start_critical_timings();
17228                 __schedule(true);
17229 +               stop_critical_timings();
17230                 exception_exit(prev_ctx);
17231  
17232                 preempt_latency_stop(1);
17233 @@ -4901,6 +5143,7 @@ int __cond_resched_lock(spinlock_t *lock)
17234  }
17235  EXPORT_SYMBOL(__cond_resched_lock);
17236  
17237 +#ifndef CONFIG_PREEMPT_RT_FULL
17238  int __sched __cond_resched_softirq(void)
17239  {
17240         BUG_ON(!in_softirq());
17241 @@ -4914,6 +5157,7 @@ int __sched __cond_resched_softirq(void)
17242         return 0;
17243  }
17244  EXPORT_SYMBOL(__cond_resched_softirq);
17245 +#endif
17246  
17247  /**
17248   * yield - yield the current processor to other threads.
17249 @@ -5283,7 +5527,9 @@ void init_idle(struct task_struct *idle, int cpu)
17250  
17251         /* Set the preempt count _outside_ the spinlocks! */
17252         init_idle_preempt_count(idle, cpu);
17253 -
17254 +#ifdef CONFIG_HAVE_PREEMPT_LAZY
17255 +       task_thread_info(idle)->preempt_lazy_count = 0;
17256 +#endif
17257         /*
17258          * The idle tasks have their own, simple scheduling class:
17259          */
17260 @@ -5426,6 +5672,8 @@ void sched_setnuma(struct task_struct *p, int nid)
17261  #endif /* CONFIG_NUMA_BALANCING */
17262  
17263  #ifdef CONFIG_HOTPLUG_CPU
17264 +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
17265 +
17266  /*
17267   * Ensures that the idle task is using init_mm right before its cpu goes
17268   * offline.
17269 @@ -5440,7 +5688,12 @@ void idle_task_exit(void)
17270                 switch_mm_irqs_off(mm, &init_mm, current);
17271                 finish_arch_post_lock_switch();
17272         }
17273 -       mmdrop(mm);
17274 +       /*
17275 +        * Defer the cleanup to a live cpu. On RT we can neither
17276 +        * call mmdrop() nor mmdrop_delayed() from here.
17277 +        */
17278 +       per_cpu(idle_last_mm, smp_processor_id()) = mm;
17279 +
17280  }
17281  
17282  /*
17283 @@ -7315,6 +7568,10 @@ int sched_cpu_dying(unsigned int cpu)
17284         update_max_interval();
17285         nohz_balance_exit_idle(cpu);
17286         hrtick_clear(rq);
17287 +       if (per_cpu(idle_last_mm, cpu)) {
17288 +               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
17289 +               per_cpu(idle_last_mm, cpu) = NULL;
17290 +       }
17291         return 0;
17292  }
17293  #endif
17294 @@ -7566,7 +7823,7 @@ void __init sched_init(void)
17295  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
17296  static inline int preempt_count_equals(int preempt_offset)
17297  {
17298 -       int nested = preempt_count() + rcu_preempt_depth();
17299 +       int nested = preempt_count() + sched_rcu_preempt_depth();
17300  
17301         return (nested == preempt_offset);
17302  }
17303 diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
17304 index 1ce8867283dc..766da04b06a0 100644
17305 --- a/kernel/sched/deadline.c
17306 +++ b/kernel/sched/deadline.c
17307 @@ -697,6 +697,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
17308  
17309         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17310         timer->function = dl_task_timer;
17311 +       timer->irqsafe = 1;
17312  }
17313  
17314  static
17315 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
17316 index 2a0a9995256d..48a9b6f57249 100644
17317 --- a/kernel/sched/debug.c
17318 +++ b/kernel/sched/debug.c
17319 @@ -552,6 +552,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
17320         P(rt_throttled);
17321         PN(rt_time);
17322         PN(rt_runtime);
17323 +#ifdef CONFIG_SMP
17324 +       P(rt_nr_migratory);
17325 +#endif
17326  
17327  #undef PN
17328  #undef P
17329 @@ -947,6 +950,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
17330  #endif
17331         P(policy);
17332         P(prio);
17333 +#ifdef CONFIG_PREEMPT_RT_FULL
17334 +       P(migrate_disable);
17335 +#endif
17336 +       P(nr_cpus_allowed);
17337  #undef PN
17338  #undef __PN
17339  #undef P
17340 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
17341 index 8b3610c871f2..1145079af264 100644
17342 --- a/kernel/sched/fair.c
17343 +++ b/kernel/sched/fair.c
17344 @@ -3508,7 +3508,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
17345         ideal_runtime = sched_slice(cfs_rq, curr);
17346         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
17347         if (delta_exec > ideal_runtime) {
17348 -               resched_curr(rq_of(cfs_rq));
17349 +               resched_curr_lazy(rq_of(cfs_rq));
17350                 /*
17351                  * The current task ran long enough, ensure it doesn't get
17352                  * re-elected due to buddy favours.
17353 @@ -3532,7 +3532,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
17354                 return;
17355  
17356         if (delta > ideal_runtime)
17357 -               resched_curr(rq_of(cfs_rq));
17358 +               resched_curr_lazy(rq_of(cfs_rq));
17359  }
17360  
17361  static void
17362 @@ -3677,7 +3677,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
17363          * validating it and just reschedule.
17364          */
17365         if (queued) {
17366 -               resched_curr(rq_of(cfs_rq));
17367 +               resched_curr_lazy(rq_of(cfs_rq));
17368                 return;
17369         }
17370         /*
17371 @@ -3859,7 +3859,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
17372          * hierarchy can be throttled
17373          */
17374         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
17375 -               resched_curr(rq_of(cfs_rq));
17376 +               resched_curr_lazy(rq_of(cfs_rq));
17377  }
17378  
17379  static __always_inline
17380 @@ -4487,7 +4487,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
17381  
17382                 if (delta < 0) {
17383                         if (rq->curr == p)
17384 -                               resched_curr(rq);
17385 +                               resched_curr_lazy(rq);
17386                         return;
17387                 }
17388                 hrtick_start(rq, delta);
17389 @@ -5676,7 +5676,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
17390         return;
17391  
17392  preempt:
17393 -       resched_curr(rq);
17394 +       resched_curr_lazy(rq);
17395         /*
17396          * Only set the backward buddy when the current task is still
17397          * on the rq. This can happen when a wakeup gets interleaved
17398 @@ -8402,7 +8402,7 @@ static void task_fork_fair(struct task_struct *p)
17399                  * 'current' within the tree based on its new key value.
17400                  */
17401                 swap(curr->vruntime, se->vruntime);
17402 -               resched_curr(rq);
17403 +               resched_curr_lazy(rq);
17404         }
17405  
17406         se->vruntime -= cfs_rq->min_vruntime;
17407 @@ -8426,7 +8426,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
17408          */
17409         if (rq->curr == p) {
17410                 if (p->prio > oldprio)
17411 -                       resched_curr(rq);
17412 +                       resched_curr_lazy(rq);
17413         } else
17414                 check_preempt_curr(rq, p, 0);
17415  }
17416 diff --git a/kernel/sched/features.h b/kernel/sched/features.h
17417 index 69631fa46c2f..6d28fcd08872 100644
17418 --- a/kernel/sched/features.h
17419 +++ b/kernel/sched/features.h
17420 @@ -45,11 +45,19 @@ SCHED_FEAT(LB_BIAS, true)
17421   */
17422  SCHED_FEAT(NONTASK_CAPACITY, true)
17423  
17424 +#ifdef CONFIG_PREEMPT_RT_FULL
17425 +SCHED_FEAT(TTWU_QUEUE, false)
17426 +# ifdef CONFIG_PREEMPT_LAZY
17427 +SCHED_FEAT(PREEMPT_LAZY, true)
17428 +# endif
17429 +#else
17430 +
17431  /*
17432   * Queue remote wakeups on the target CPU and process them
17433   * using the scheduler IPI. Reduces rq->lock contention/bounces.
17434   */
17435  SCHED_FEAT(TTWU_QUEUE, true)
17436 +#endif
17437  
17438  #ifdef HAVE_RT_PUSH_IPI
17439  /*
17440 diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
17441 index d5690b722691..731cd0e98c15 100644
17442 --- a/kernel/sched/rt.c
17443 +++ b/kernel/sched/rt.c
17444 @@ -47,6 +47,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
17445  
17446         hrtimer_init(&rt_b->rt_period_timer,
17447                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
17448 +       rt_b->rt_period_timer.irqsafe = 1;
17449         rt_b->rt_period_timer.function = sched_rt_period_timer;
17450  }
17451  
17452 @@ -101,6 +102,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
17453         rt_rq->push_cpu = nr_cpu_ids;
17454         raw_spin_lock_init(&rt_rq->push_lock);
17455         init_irq_work(&rt_rq->push_work, push_irq_work_func);
17456 +       rt_rq->push_work.flags |= IRQ_WORK_HARD_IRQ;
17457  #endif
17458  #endif /* CONFIG_SMP */
17459         /* We start is dequeued state, because no RT tasks are queued */
17460 diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
17461 index c64fc5114004..af58f9b3ece4 100644
17462 --- a/kernel/sched/sched.h
17463 +++ b/kernel/sched/sched.h
17464 @@ -1138,6 +1138,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
17465  #define WF_SYNC                0x01            /* waker goes to sleep after wakeup */
17466  #define WF_FORK                0x02            /* child wakeup after fork */
17467  #define WF_MIGRATED    0x4             /* internal use, task got migrated */
17468 +#define WF_LOCK_SLEEPER        0x08            /* wakeup spinlock "sleeper" */
17469  
17470  /*
17471   * To aid in avoiding the subversion of "niceness" due to uneven distribution
17472 @@ -1316,6 +1317,15 @@ extern void init_sched_fair_class(void);
17473  extern void resched_curr(struct rq *rq);
17474  extern void resched_cpu(int cpu);
17475  
17476 +#ifdef CONFIG_PREEMPT_LAZY
17477 +extern void resched_curr_lazy(struct rq *rq);
17478 +#else
17479 +static inline void resched_curr_lazy(struct rq *rq)
17480 +{
17481 +       resched_curr(rq);
17482 +}
17483 +#endif
17484 +
17485  extern struct rt_bandwidth def_rt_bandwidth;
17486  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
17487  
17488 diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
17489 index 82f0dff90030..ef027ff3250a 100644
17490 --- a/kernel/sched/swait.c
17491 +++ b/kernel/sched/swait.c
17492 @@ -1,5 +1,6 @@
17493  #include <linux/sched.h>
17494  #include <linux/swait.h>
17495 +#include <linux/suspend.h>
17496  
17497  void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
17498                              struct lock_class_key *key)
17499 @@ -29,6 +30,25 @@ void swake_up_locked(struct swait_queue_head *q)
17500  }
17501  EXPORT_SYMBOL(swake_up_locked);
17502  
17503 +void swake_up_all_locked(struct swait_queue_head *q)
17504 +{
17505 +       struct swait_queue *curr;
17506 +       int wakes = 0;
17507 +
17508 +       while (!list_empty(&q->task_list)) {
17509 +
17510 +               curr = list_first_entry(&q->task_list, typeof(*curr),
17511 +                                       task_list);
17512 +               wake_up_process(curr->task);
17513 +               list_del_init(&curr->task_list);
17514 +               wakes++;
17515 +       }
17516 +       if (pm_in_action)
17517 +               return;
17518 +       WARN(wakes > 2, "complete_all() with %d waiters\n", wakes);
17519 +}
17520 +EXPORT_SYMBOL(swake_up_all_locked);
17521 +
17522  void swake_up(struct swait_queue_head *q)
17523  {
17524         unsigned long flags;
17525 @@ -54,6 +74,7 @@ void swake_up_all(struct swait_queue_head *q)
17526         if (!swait_active(q))
17527                 return;
17528  
17529 +       WARN_ON(irqs_disabled());
17530         raw_spin_lock_irq(&q->lock);
17531         list_splice_init(&q->task_list, &tmp);
17532         while (!list_empty(&tmp)) {
17533 diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c
17534 new file mode 100644
17535 index 000000000000..1950f40ca725
17536 --- /dev/null
17537 +++ b/kernel/sched/swork.c
17538 @@ -0,0 +1,173 @@
17539 +/*
17540 + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de
17541 + *
17542 + * Provides a PREEMPT_RT_FULL-safe framework for enqueuing callbacks from
17543 + * irq context. The callbacks are executed in kthread context.
17544 + */
17545 +
17546 +#include <linux/swait.h>
17547 +#include <linux/swork.h>
17548 +#include <linux/kthread.h>
17549 +#include <linux/slab.h>
17550 +#include <linux/spinlock.h>
17551 +#include <linux/export.h>
17552 +
17553 +#define SWORK_EVENT_PENDING     (1 << 0)
17554 +
17555 +static DEFINE_MUTEX(worker_mutex);
17556 +static struct sworker *glob_worker;
17557 +
17558 +struct sworker {
17559 +       struct list_head events;
17560 +       struct swait_queue_head wq;
17561 +
17562 +       raw_spinlock_t lock;
17563 +
17564 +       struct task_struct *task;
17565 +       int refs;
17566 +};
17567 +
17568 +static bool swork_readable(struct sworker *worker)
17569 +{
17570 +       bool r;
17571 +
17572 +       if (kthread_should_stop())
17573 +               return true;
17574 +
17575 +       raw_spin_lock_irq(&worker->lock);
17576 +       r = !list_empty(&worker->events);
17577 +       raw_spin_unlock_irq(&worker->lock);
17578 +
17579 +       return r;
17580 +}
17581 +
17582 +static int swork_kthread(void *arg)
17583 +{
17584 +       struct sworker *worker = arg;
17585 +
17586 +       for (;;) {
17587 +               swait_event_interruptible(worker->wq,
17588 +                                       swork_readable(worker));
17589 +               if (kthread_should_stop())
17590 +                       break;
17591 +
17592 +               raw_spin_lock_irq(&worker->lock);
17593 +               while (!list_empty(&worker->events)) {
17594 +                       struct swork_event *sev;
17595 +
17596 +                       sev = list_first_entry(&worker->events,
17597 +                                       struct swork_event, item);
17598 +                       list_del(&sev->item);
17599 +                       raw_spin_unlock_irq(&worker->lock);
17600 +
17601 +                       WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING,
17602 +                                                        &sev->flags));
17603 +                       sev->func(sev);
17604 +                       raw_spin_lock_irq(&worker->lock);
17605 +               }
17606 +               raw_spin_unlock_irq(&worker->lock);
17607 +       }
17608 +       return 0;
17609 +}
17610 +
17611 +static struct sworker *swork_create(void)
17612 +{
17613 +       struct sworker *worker;
17614 +
17615 +       worker = kzalloc(sizeof(*worker), GFP_KERNEL);
17616 +       if (!worker)
17617 +               return ERR_PTR(-ENOMEM);
17618 +
17619 +       INIT_LIST_HEAD(&worker->events);
17620 +       raw_spin_lock_init(&worker->lock);
17621 +       init_swait_queue_head(&worker->wq);
17622 +
17623 +       worker->task = kthread_run(swork_kthread, worker, "kswork");
17624 +       if (IS_ERR(worker->task)) {
17625 +               kfree(worker);
17626 +               return ERR_PTR(-ENOMEM);
17627 +       }
17628 +
17629 +       return worker;
17630 +}
17631 +
17632 +static void swork_destroy(struct sworker *worker)
17633 +{
17634 +       kthread_stop(worker->task);
17635 +
17636 +       WARN_ON(!list_empty(&worker->events));
17637 +       kfree(worker);
17638 +}
17639 +
17640 +/**
17641 + * swork_queue - queue swork
17642 + *
17643 + * Returns %false if @sev was already on a queue, %true otherwise.
17644 + *
17645 + * The work is queued and processed on a random CPU.
17646 + */
17647 +bool swork_queue(struct swork_event *sev)
17648 +{
17649 +       unsigned long flags;
17650 +
17651 +       if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags))
17652 +               return false;
17653 +
17654 +       raw_spin_lock_irqsave(&glob_worker->lock, flags);
17655 +       list_add_tail(&sev->item, &glob_worker->events);
17656 +       raw_spin_unlock_irqrestore(&glob_worker->lock, flags);
17657 +
17658 +       swake_up(&glob_worker->wq);
17659 +       return true;
17660 +}
17661 +EXPORT_SYMBOL_GPL(swork_queue);
17662 +
17663 +/**
17664 + * swork_get - get an instance of the sworker
17665 + *
17666 + * Returns a negative error code if the initialization of the worker did not
17667 + * succeed, %0 otherwise.
17668 + *
17669 + */
17670 +int swork_get(void)
17671 +{
17672 +       struct sworker *worker;
17673 +
17674 +       mutex_lock(&worker_mutex);
17675 +       if (!glob_worker) {
17676 +               worker = swork_create();
17677 +               if (IS_ERR(worker)) {
17678 +                       mutex_unlock(&worker_mutex);
17679 +                       return -ENOMEM;
17680 +               }
17681 +
17682 +               glob_worker = worker;
17683 +       }
17684 +
17685 +       glob_worker->refs++;
17686 +       mutex_unlock(&worker_mutex);
17687 +
17688 +       return 0;
17689 +}
17690 +EXPORT_SYMBOL_GPL(swork_get);
17691 +
17692 +/**
17693 + * swork_put - puts an instance of the sworker
17694 + *
17695 + * Will destroy the sworker thread. This function must not be called until all
17696 + * queued events have been completed.
17697 + */
17698 +void swork_put(void)
17699 +{
17700 +       mutex_lock(&worker_mutex);
17701 +
17702 +       glob_worker->refs--;
17703 +       if (glob_worker->refs > 0)
17704 +               goto out;
17705 +
17706 +       swork_destroy(glob_worker);
17707 +       glob_worker = NULL;
17708 +out:
17709 +       mutex_unlock(&worker_mutex);
17710 +}
17711 +EXPORT_SYMBOL_GPL(swork_put);
17712 diff --git a/kernel/signal.c b/kernel/signal.c
17713 index af21afc00d08..7ead97a43298 100644
17714 --- a/kernel/signal.c
17715 +++ b/kernel/signal.c
17716 @@ -14,6 +14,7 @@
17717  #include <linux/export.h>
17718  #include <linux/init.h>
17719  #include <linux/sched.h>
17720 +#include <linux/sched/rt.h>
17721  #include <linux/fs.h>
17722  #include <linux/tty.h>
17723  #include <linux/binfmts.h>
17724 @@ -352,13 +353,30 @@ static bool task_participate_group_stop(struct task_struct *task)
17725         return false;
17726  }
17727  
17728 +static inline struct sigqueue *get_task_cache(struct task_struct *t)
17729 +{
17730 +       struct sigqueue *q = t->sigqueue_cache;
17731 +
17732 +       if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
17733 +               return NULL;
17734 +       return q;
17735 +}
17736 +
17737 +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
17738 +{
17739 +       if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
17740 +               return 0;
17741 +       return 1;
17742 +}
17743 +
17744  /*
17745   * allocate a new signal queue record
17746   * - this may be called without locks if and only if t == current, otherwise an
17747   *   appropriate lock must be held to stop the target task from exiting
17748   */
17749  static struct sigqueue *
17750 -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
17751 +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
17752 +                   int override_rlimit, int fromslab)
17753  {
17754         struct sigqueue *q = NULL;
17755         struct user_struct *user;
17756 @@ -375,7 +393,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
17757         if (override_rlimit ||
17758             atomic_read(&user->sigpending) <=
17759                         task_rlimit(t, RLIMIT_SIGPENDING)) {
17760 -               q = kmem_cache_alloc(sigqueue_cachep, flags);
17761 +               if (!fromslab)
17762 +                       q = get_task_cache(t);
17763 +               if (!q)
17764 +                       q = kmem_cache_alloc(sigqueue_cachep, flags);
17765         } else {
17766                 print_dropped_signal(sig);
17767         }
17768 @@ -392,6 +413,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
17769         return q;
17770  }
17771  
17772 +static struct sigqueue *
17773 +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
17774 +                int override_rlimit)
17775 +{
17776 +       return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
17777 +}
17778 +
17779  static void __sigqueue_free(struct sigqueue *q)
17780  {
17781         if (q->flags & SIGQUEUE_PREALLOC)
17782 @@ -401,6 +429,21 @@ static void __sigqueue_free(struct sigqueue *q)
17783         kmem_cache_free(sigqueue_cachep, q);
17784  }
17785  
17786 +static void sigqueue_free_current(struct sigqueue *q)
17787 +{
17788 +       struct user_struct *up;
17789 +
17790 +       if (q->flags & SIGQUEUE_PREALLOC)
17791 +               return;
17792 +
17793 +       up = q->user;
17794 +       if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
17795 +               atomic_dec(&up->sigpending);
17796 +               free_uid(up);
17797 +       } else
17798 +                 __sigqueue_free(q);
17799 +}
17800 +
17801  void flush_sigqueue(struct sigpending *queue)
17802  {
17803         struct sigqueue *q;
17804 @@ -414,6 +457,21 @@ void flush_sigqueue(struct sigpending *queue)
17805  }
17806  
17807  /*
17808 + * Called from __exit_signal. Flush tsk->pending and
17809 + * tsk->sigqueue_cache
17810 + */
17811 +void flush_task_sigqueue(struct task_struct *tsk)
17812 +{
17813 +       struct sigqueue *q;
17814 +
17815 +       flush_sigqueue(&tsk->pending);
17816 +
17817 +       q = get_task_cache(tsk);
17818 +       if (q)
17819 +               kmem_cache_free(sigqueue_cachep, q);
17820 +}
17821 +
17822 +/*
17823   * Flush all pending signals for this kthread.
17824   */
17825  void flush_signals(struct task_struct *t)
17826 @@ -525,7 +583,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
17827  still_pending:
17828                 list_del_init(&first->list);
17829                 copy_siginfo(info, &first->info);
17830 -               __sigqueue_free(first);
17831 +               sigqueue_free_current(first);
17832         } else {
17833                 /*
17834                  * Ok, it wasn't in the queue.  This must be
17835 @@ -560,6 +618,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
17836  {
17837         int signr;
17838  
17839 +       WARN_ON_ONCE(tsk != current);
17840 +
17841         /* We only dequeue private signals from ourselves, we don't let
17842          * signalfd steal them
17843          */
17844 @@ -1156,8 +1216,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
17845   * We don't want to have recursive SIGSEGV's etc, for example,
17846   * that is why we also clear SIGNAL_UNKILLABLE.
17847   */
17848 -int
17849 -force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17850 +static int
17851 +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17852  {
17853         unsigned long int flags;
17854         int ret, blocked, ignored;
17855 @@ -1182,6 +1242,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17856         return ret;
17857  }
17858  
17859 +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
17860 +{
17861 +/*
17862 + * On some archs, PREEMPT_RT has to delay sending a signal from a trap
17863 + * since it cannot enable preemption, and the signal code's spin_locks
17864 + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will
17865 + * send the signal on exit of the trap.
17866 + */
17867 +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
17868 +       if (in_atomic()) {
17869 +               if (WARN_ON_ONCE(t != current))
17870 +                       return 0;
17871 +               if (WARN_ON_ONCE(t->forced_info.si_signo))
17872 +                       return 0;
17873 +
17874 +               if (is_si_special(info)) {
17875 +                       WARN_ON_ONCE(info != SEND_SIG_PRIV);
17876 +                       t->forced_info.si_signo = sig;
17877 +                       t->forced_info.si_errno = 0;
17878 +                       t->forced_info.si_code = SI_KERNEL;
17879 +                       t->forced_info.si_pid = 0;
17880 +                       t->forced_info.si_uid = 0;
17881 +               } else {
17882 +                       t->forced_info = *info;
17883 +               }
17884 +
17885 +               set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
17886 +               return 0;
17887 +       }
17888 +#endif
17889 +       return do_force_sig_info(sig, info, t);
17890 +}
17891 +
17892  /*
17893   * Nuke all other threads in the group.
17894   */
17895 @@ -1216,12 +1309,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
17896                  * Disable interrupts early to avoid deadlocks.
17897                  * See rcu_read_unlock() comment header for details.
17898                  */
17899 -               local_irq_save(*flags);
17900 +               local_irq_save_nort(*flags);
17901                 rcu_read_lock();
17902                 sighand = rcu_dereference(tsk->sighand);
17903                 if (unlikely(sighand == NULL)) {
17904                         rcu_read_unlock();
17905 -                       local_irq_restore(*flags);
17906 +                       local_irq_restore_nort(*flags);
17907                         break;
17908                 }
17909                 /*
17910 @@ -1242,7 +1335,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
17911                 }
17912                 spin_unlock(&sighand->siglock);
17913                 rcu_read_unlock();
17914 -               local_irq_restore(*flags);
17915 +               local_irq_restore_nort(*flags);
17916         }
17917  
17918         return sighand;
17919 @@ -1485,7 +1578,8 @@ EXPORT_SYMBOL(kill_pid);
17920   */
17921  struct sigqueue *sigqueue_alloc(void)
17922  {
17923 -       struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
17924 +       /* Preallocated sigqueue objects always come from the slabcache! */
17925 +       struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
17926  
17927         if (q)
17928                 q->flags |= SIGQUEUE_PREALLOC;
17929 @@ -1846,15 +1940,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
17930                 if (gstop_done && ptrace_reparented(current))
17931                         do_notify_parent_cldstop(current, false, why);
17932  
17933 -               /*
17934 -                * Don't want to allow preemption here, because
17935 -                * sys_ptrace() needs this task to be inactive.
17936 -                *
17937 -                * XXX: implement read_unlock_no_resched().
17938 -                */
17939 -               preempt_disable();
17940                 read_unlock(&tasklist_lock);
17941 -               preempt_enable_no_resched();
17942                 freezable_schedule();
17943         } else {
17944                 /*
17945 diff --git a/kernel/softirq.c b/kernel/softirq.c
17946 index 17caf4b63342..a602b7152de7 100644
17947 --- a/kernel/softirq.c
17948 +++ b/kernel/softirq.c
17949 @@ -21,10 +21,12 @@
17950  #include <linux/freezer.h>
17951  #include <linux/kthread.h>
17952  #include <linux/rcupdate.h>
17953 +#include <linux/delay.h>
17954  #include <linux/ftrace.h>
17955  #include <linux/smp.h>
17956  #include <linux/smpboot.h>
17957  #include <linux/tick.h>
17958 +#include <linux/locallock.h>
17959  #include <linux/irq.h>
17960  
17961  #define CREATE_TRACE_POINTS
17962 @@ -56,12 +58,108 @@ EXPORT_SYMBOL(irq_stat);
17963  static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
17964  
17965  DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
17966 +#ifdef CONFIG_PREEMPT_RT_FULL
17967 +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ))
17968 +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd);
17969 +#endif
17970  
17971  const char * const softirq_to_name[NR_SOFTIRQS] = {
17972         "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
17973         "TASKLET", "SCHED", "HRTIMER", "RCU"
17974  };
17975  
17976 +#ifdef CONFIG_NO_HZ_COMMON
17977 +# ifdef CONFIG_PREEMPT_RT_FULL
17978 +
17979 +struct softirq_runner {
17980 +       struct task_struct *runner[NR_SOFTIRQS];
17981 +};
17982 +
17983 +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners);
17984 +
17985 +static inline void softirq_set_runner(unsigned int sirq)
17986 +{
17987 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
17988 +
17989 +       sr->runner[sirq] = current;
17990 +}
17991 +
17992 +static inline void softirq_clr_runner(unsigned int sirq)
17993 +{
17994 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
17995 +
17996 +       sr->runner[sirq] = NULL;
17997 +}
17998 +
17999 +/*
18000 + * On preempt-rt a softirq running context might be blocked on a
18001 + * lock. There might be no other runnable task on this CPU because the
18002 + * lock owner runs on some other CPU. So we have to go into idle with
18003 + * the pending bit set. Therefore we need to check this; otherwise we
18004 + * warn about false positives, which confuses users and defeats the
18005 + * whole purpose of this test.
18006 + *
18007 + * This code is called with interrupts disabled.
18008 + */
18009 +void softirq_check_pending_idle(void)
18010 +{
18011 +       static int rate_limit;
18012 +       struct softirq_runner *sr = this_cpu_ptr(&softirq_runners);
18013 +       u32 warnpending;
18014 +       int i;
18015 +
18016 +       if (rate_limit >= 10)
18017 +               return;
18018 +
18019 +       warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK;
18020 +       for (i = 0; i < NR_SOFTIRQS; i++) {
18021 +               struct task_struct *tsk = sr->runner[i];
18022 +
18023 +               /*
18024 +                * The wakeup code in rtmutex.c wakes up the task
18025 +                * _before_ it sets pi_blocked_on to NULL under
18026 +                * tsk->pi_lock. So we need to check for both: state
18027 +                * and pi_blocked_on.
18028 +                */
18029 +               if (tsk) {
18030 +                       raw_spin_lock(&tsk->pi_lock);
18031 +                       if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) {
18032 +                               /* Clear all bits pending in that task */
18033 +                               warnpending &= ~(tsk->softirqs_raised);
18034 +                               warnpending &= ~(1 << i);
18035 +                       }
18036 +                       raw_spin_unlock(&tsk->pi_lock);
18037 +               }
18038 +       }
18039 +
18040 +       if (warnpending) {
18041 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
18042 +                      warnpending);
18043 +               rate_limit++;
18044 +       }
18045 +}
18046 +# else
18047 +/*
18048 + * On !PREEMPT_RT we just printk rate limited:
18049 + */
18050 +void softirq_check_pending_idle(void)
18051 +{
18052 +       static int rate_limit;
18053 +
18054 +       if (rate_limit < 10 &&
18055 +                       (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
18056 +               printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
18057 +                      local_softirq_pending());
18058 +               rate_limit++;
18059 +       }
18060 +}
18061 +# endif
18062 +
18063 +#else /* !CONFIG_NO_HZ_COMMON */
18064 +static inline void softirq_set_runner(unsigned int sirq) { }
18065 +static inline void softirq_clr_runner(unsigned int sirq) { }
18066 +#endif
18067 +
18068  /*
18069   * we cannot loop indefinitely here to avoid userspace starvation,
18070   * but we also don't want to introduce a worst case 1/HZ latency
18071 @@ -77,6 +175,79 @@ static void wakeup_softirqd(void)
18072                 wake_up_process(tsk);
18073  }
18074  
18075 +#ifdef CONFIG_PREEMPT_RT_FULL
18076 +static void wakeup_timer_softirqd(void)
18077 +{
18078 +       /* Interrupts are disabled: no need to stop preemption */
18079 +       struct task_struct *tsk = __this_cpu_read(ktimer_softirqd);
18080 +
18081 +       if (tsk && tsk->state != TASK_RUNNING)
18082 +               wake_up_process(tsk);
18083 +}
18084 +#endif
18085 +
18086 +static void handle_softirq(unsigned int vec_nr)
18087 +{
18088 +       struct softirq_action *h = softirq_vec + vec_nr;
18089 +       int prev_count;
18090 +
18091 +       prev_count = preempt_count();
18092 +
18093 +       kstat_incr_softirqs_this_cpu(vec_nr);
18094 +
18095 +       trace_softirq_entry(vec_nr);
18096 +       h->action(h);
18097 +       trace_softirq_exit(vec_nr);
18098 +       if (unlikely(prev_count != preempt_count())) {
18099 +               pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
18100 +                      vec_nr, softirq_to_name[vec_nr], h->action,
18101 +                      prev_count, preempt_count());
18102 +               preempt_count_set(prev_count);
18103 +       }
18104 +}
18105 +
18106 +#ifndef CONFIG_PREEMPT_RT_FULL
18107 +static inline int ksoftirqd_softirq_pending(void)
18108 +{
18109 +       return local_softirq_pending();
18110 +}
18111 +
18112 +static void handle_pending_softirqs(u32 pending)
18113 +{
18114 +       struct softirq_action *h = softirq_vec;
18115 +       int softirq_bit;
18116 +
18117 +       local_irq_enable();
18118 +
18119 +       h = softirq_vec;
18120 +
18121 +       while ((softirq_bit = ffs(pending))) {
18122 +               unsigned int vec_nr;
18123 +
18124 +               h += softirq_bit - 1;
18125 +               vec_nr = h - softirq_vec;
18126 +               handle_softirq(vec_nr);
18127 +
18128 +               h++;
18129 +               pending >>= softirq_bit;
18130 +       }
18131 +
18132 +       rcu_bh_qs();
18133 +       local_irq_disable();
18134 +}
18135 +
18136 +static void run_ksoftirqd(unsigned int cpu)
18137 +{
18138 +       local_irq_disable();
18139 +       if (ksoftirqd_softirq_pending()) {
18140 +               __do_softirq();
18141 +               local_irq_enable();
18142 +               cond_resched_rcu_qs();
18143 +               return;
18144 +       }
18145 +       local_irq_enable();
18146 +}
18147 +
18148  /*
18149   * preempt_count and SOFTIRQ_OFFSET usage:
18150   * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
18151 @@ -232,10 +403,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
18152         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
18153         unsigned long old_flags = current->flags;
18154         int max_restart = MAX_SOFTIRQ_RESTART;
18155 -       struct softirq_action *h;
18156         bool in_hardirq;
18157         __u32 pending;
18158 -       int softirq_bit;
18159  
18160         /*
18161          * Mask out PF_MEMALLOC s current task context is borrowed for the
18162 @@ -254,36 +423,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
18163         /* Reset the pending bitmask before enabling irqs */
18164         set_softirq_pending(0);
18165  
18166 -       local_irq_enable();
18167 -
18168 -       h = softirq_vec;
18169 -
18170 -       while ((softirq_bit = ffs(pending))) {
18171 -               unsigned int vec_nr;
18172 -               int prev_count;
18173 -
18174 -               h += softirq_bit - 1;
18175 -
18176 -               vec_nr = h - softirq_vec;
18177 -               prev_count = preempt_count();
18178 -
18179 -               kstat_incr_softirqs_this_cpu(vec_nr);
18180 -
18181 -               trace_softirq_entry(vec_nr);
18182 -               h->action(h);
18183 -               trace_softirq_exit(vec_nr);
18184 -               if (unlikely(prev_count != preempt_count())) {
18185 -                       pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
18186 -                              vec_nr, softirq_to_name[vec_nr], h->action,
18187 -                              prev_count, preempt_count());
18188 -                       preempt_count_set(prev_count);
18189 -               }
18190 -               h++;
18191 -               pending >>= softirq_bit;
18192 -       }
18193 -
18194 -       rcu_bh_qs();
18195 -       local_irq_disable();
18196 +       handle_pending_softirqs(pending);
18197  
18198         pending = local_softirq_pending();
18199         if (pending) {
18200 @@ -320,6 +460,310 @@ asmlinkage __visible void do_softirq(void)
18201  }
18202  
18203  /*
18204 + * This function must run with irqs disabled!
18205 + */
18206 +void raise_softirq_irqoff(unsigned int nr)
18207 +{
18208 +       __raise_softirq_irqoff(nr);
18209 +
18210 +       /*
18211 +        * If we're in an interrupt or softirq, we're done
18212 +        * (this also catches softirq-disabled code). We will
18213 +        * actually run the softirq once we return from
18214 +        * the irq or softirq.
18215 +        *
18216 +        * Otherwise we wake up ksoftirqd to make sure we
18217 +        * schedule the softirq soon.
18218 +        */
18219 +       if (!in_interrupt())
18220 +               wakeup_softirqd();
18221 +}
18222 +
18223 +void __raise_softirq_irqoff(unsigned int nr)
18224 +{
18225 +       trace_softirq_raise(nr);
18226 +       or_softirq_pending(1UL << nr);
18227 +}
18228 +
18229 +static inline void local_bh_disable_nort(void) { local_bh_disable(); }
18230 +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); }
18231 +static void ksoftirqd_set_sched_params(unsigned int cpu) { }
18232 +
18233 +#else /* !PREEMPT_RT_FULL */
18234 +
18235 +/*
18236 + * On RT we serialize softirq execution with a cpu local lock per softirq
18237 + */
18238 +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks);
18239 +
18240 +void __init softirq_early_init(void)
18241 +{
18242 +       int i;
18243 +
18244 +       for (i = 0; i < NR_SOFTIRQS; i++)
18245 +               local_irq_lock_init(local_softirq_locks[i]);
18246 +}
18247 +
18248 +static void lock_softirq(int which)
18249 +{
18250 +       local_lock(local_softirq_locks[which]);
18251 +}
18252 +
18253 +static void unlock_softirq(int which)
18254 +{
18255 +       local_unlock(local_softirq_locks[which]);
18256 +}
18257 +
18258 +static void do_single_softirq(int which)
18259 +{
18260 +       unsigned long old_flags = current->flags;
18261 +
18262 +       current->flags &= ~PF_MEMALLOC;
18263 +       vtime_account_irq_enter(current);
18264 +       current->flags |= PF_IN_SOFTIRQ;
18265 +       lockdep_softirq_enter();
18266 +       local_irq_enable();
18267 +       handle_softirq(which);
18268 +       local_irq_disable();
18269 +       lockdep_softirq_exit();
18270 +       current->flags &= ~PF_IN_SOFTIRQ;
18271 +       vtime_account_irq_enter(current);
18272 +       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
18273 +}
18274 +
18275 +/*
18276 + * Called with interrupts disabled. Process softirqs which were raised
18277 + * in current context (or on behalf of ksoftirqd).
18278 + */
18279 +static void do_current_softirqs(void)
18280 +{
18281 +       while (current->softirqs_raised) {
18282 +               int i = __ffs(current->softirqs_raised);
18283 +               unsigned int pending, mask = (1U << i);
18284 +
18285 +               current->softirqs_raised &= ~mask;
18286 +               local_irq_enable();
18287 +
18288 +               /*
18289 +                * If the lock is contended, we boost the owner to
18290 +                * process the softirq or leave the critical section
18291 +                * now.
18292 +                */
18293 +               lock_softirq(i);
18294 +               local_irq_disable();
18295 +               softirq_set_runner(i);
18296 +               /*
18297 +                * Check the local_softirq_pending() bits to see whether
18298 +                * we still need to process this or whether someone else
18299 +                * already took care of it.
18300 +                */
18301 +               pending = local_softirq_pending();
18302 +               if (pending & mask) {
18303 +                       set_softirq_pending(pending & ~mask);
18304 +                       do_single_softirq(i);
18305 +               }
18306 +               softirq_clr_runner(i);
18307 +               WARN_ON(current->softirq_nestcnt != 1);
18308 +               local_irq_enable();
18309 +               unlock_softirq(i);
18310 +               local_irq_disable();
18311 +       }
18312 +}
18313 +
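+/*
+ * Bottom half disabling on RT: keep a per-task softirq nesting count and
+ * disable migration. Softirqs raised inside the section are processed by
+ * __local_bh_enable() when the outermost section is left.
+ */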
18314 +void __local_bh_disable(void)
18315 +{
18316 +       if (++current->softirq_nestcnt == 1)
18317 +               migrate_disable();
18318 +}
18319 +EXPORT_SYMBOL(__local_bh_disable);
18320 +
18321 +void __local_bh_enable(void)
18322 +{
18323 +       if (WARN_ON(current->softirq_nestcnt == 0))
18324 +               return;
18325 +
18326 +       local_irq_disable();
18327 +       if (current->softirq_nestcnt == 1 && current->softirqs_raised)
18328 +               do_current_softirqs();
18329 +       local_irq_enable();
18330 +
18331 +       if (--current->softirq_nestcnt == 0)
18332 +               migrate_enable();
18333 +}
18334 +EXPORT_SYMBOL(__local_bh_enable);
18335 +
18336 +void _local_bh_enable(void)
18337 +{
18338 +       if (WARN_ON(current->softirq_nestcnt == 0))
18339 +               return;
18340 +       if (--current->softirq_nestcnt == 0)
18341 +               migrate_enable();
18342 +}
18343 +EXPORT_SYMBOL(_local_bh_enable);
18344 +
18345 +int in_serving_softirq(void)
18346 +{
18347 +       return current->flags & PF_IN_SOFTIRQ;
18348 +}
18349 +EXPORT_SYMBOL(in_serving_softirq);
18350 +
18351 +/* Called with preemption disabled */
18352 +static void run_ksoftirqd(unsigned int cpu)
18353 +{
18354 +       local_irq_disable();
18355 +       current->softirq_nestcnt++;
18356 +
18357 +       do_current_softirqs();
18358 +       current->softirq_nestcnt--;
18359 +       local_irq_enable();
18360 +       cond_resched_rcu_qs();
18361 +}
18362 +
18363 +/*
18364 + * Called from netif_rx_ni(). Preemption enabled, but migration
18365 + * disabled. So the cpu can't go away under us.
18366 + */
18367 +void thread_do_softirq(void)
18368 +{
18369 +       if (!in_serving_softirq() && current->softirqs_raised) {
18370 +               current->softirq_nestcnt++;
18371 +               do_current_softirqs();
18372 +               current->softirq_nestcnt--;
18373 +       }
18374 +}
18375 +
18376 +static void do_raise_softirq_irqoff(unsigned int nr)
18377 +{
18378 +       unsigned int mask;
18379 +
18380 +       mask = 1UL << nr;
18381 +
18382 +       trace_softirq_raise(nr);
18383 +       or_softirq_pending(mask);
18384 +
18385 +       /*
18386 +        * If we are not in a hard interrupt and inside a bh disabled
18387 +        * region, we simply raise the flag on current. local_bh_enable()
18388 +        * will make sure that the softirq is executed. Otherwise we
18389 +        * delegate it to ksoftirqd.
18390 +        */
18391 +       if (!in_irq() && current->softirq_nestcnt)
18392 +               current->softirqs_raised |= mask;
18393 +       else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd))
18394 +               return;
18395 +
18396 +       if (mask & TIMER_SOFTIRQS)
18397 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
18398 +       else
18399 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
18400 +}
18401 +
18402 +static void wakeup_proper_softirq(unsigned int nr)
18403 +{
18404 +       if ((1UL << nr) & TIMER_SOFTIRQS)
18405 +               wakeup_timer_softirqd();
18406 +       else
18407 +               wakeup_softirqd();
18408 +}
18409 +
18411 +void __raise_softirq_irqoff(unsigned int nr)
18412 +{
18413 +       do_raise_softirq_irqoff(nr);
18414 +       if (!in_irq() && !current->softirq_nestcnt)
18415 +               wakeup_proper_softirq(nr);
18416 +}
18417 +
18418 +/*
18419 + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
18420 + */
18421 +void __raise_softirq_irqoff_ksoft(unsigned int nr)
18422 +{
18423 +       unsigned int mask;
18424 +
18425 +       if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) ||
18426 +                        !__this_cpu_read(ktimer_softirqd)))
18427 +               return;
18428 +       mask = 1UL << nr;
18429 +
18430 +       trace_softirq_raise(nr);
18431 +       or_softirq_pending(mask);
18432 +       if (mask & TIMER_SOFTIRQS)
18433 +               __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask;
18434 +       else
18435 +               __this_cpu_read(ksoftirqd)->softirqs_raised |= mask;
18436 +       wakeup_proper_softirq(nr);
18437 +}
18438 +
18439 +/*
18440 + * This function must run with irqs disabled!
18441 + */
18442 +void raise_softirq_irqoff(unsigned int nr)
18443 +{
18444 +       do_raise_softirq_irqoff(nr);
18445 +
18446 +       /*
18447 +        * If we're in a hard interrupt we let the irq return code deal
18448 +        * with the wakeup of ksoftirqd.
18449 +        */
18450 +       if (in_irq())
18451 +               return;
18452 +       /*
18453 +        * If we are in thread context but outside of a bh disabled
18454 +        * region, we need to wake ksoftirqd as well.
18455 +        *
18456 +        * CHECKME: Some of the places which do that could be wrapped
18457 +        * into local_bh_disable/enable pairs. Though it's unclear
18458 +        * whether this is worth the effort. To find those places just
18459 +        * raise a WARN() if the condition is met.
18460 +        */
18461 +       if (!current->softirq_nestcnt)
18462 +               wakeup_proper_softirq(nr);
18463 +}
18464 +
18465 +static inline int ksoftirqd_softirq_pending(void)
18466 +{
18467 +       return current->softirqs_raised;
18468 +}
18469 +
18470 +static inline void local_bh_disable_nort(void) { }
18471 +static inline void _local_bh_enable_nort(void) { }
18472 +
18473 +static inline void ksoftirqd_set_sched_params(unsigned int cpu)
18474 +{
18475 +       /* Take over all pending softirqs except timer softirqs when starting */
18476 +       local_irq_disable();
18477 +       current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS;
18478 +       local_irq_enable();
18479 +}
18480 +
18481 +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu)
18482 +{
18483 +       struct sched_param param = { .sched_priority = 1 };
18484 +
18485 +       sched_setscheduler(current, SCHED_FIFO, &param);
18486 +
18487 +       /* Take over timer pending softirqs when starting */
18488 +       local_irq_disable();
18489 +       current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS;
18490 +       local_irq_enable();
18491 +}
18492 +
18493 +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu,
18494 +                                                   bool online)
18495 +{
18496 +       struct sched_param param = { .sched_priority = 0 };
18497 +
18498 +       sched_setscheduler(current, SCHED_NORMAL, &param);
18499 +}
18500 +
18501 +static int ktimer_softirqd_should_run(unsigned int cpu)
18502 +{
18503 +       return current->softirqs_raised;
18504 +}
18505 +
18506 +#endif /* PREEMPT_RT_FULL */
18507 +/*
18508   * Enter an interrupt context.
18509   */
18510  void irq_enter(void)
18511 @@ -330,9 +774,9 @@ void irq_enter(void)
18512                  * Prevent raise_softirq from needlessly waking up ksoftirqd
18513                  * here, as softirq will be serviced on return from interrupt.
18514                  */
18515 -               local_bh_disable();
18516 +               local_bh_disable_nort();
18517                 tick_irq_enter();
18518 -               _local_bh_enable();
18519 +               _local_bh_enable_nort();
18520         }
18521  
18522         __irq_enter();
18523 @@ -340,6 +784,7 @@ void irq_enter(void)
18524  
18525  static inline void invoke_softirq(void)
18526  {
18527 +#ifndef CONFIG_PREEMPT_RT_FULL
18528         if (!force_irqthreads) {
18529  #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
18530                 /*
18531 @@ -359,6 +804,18 @@ static inline void invoke_softirq(void)
18532         } else {
18533                 wakeup_softirqd();
18534         }
18535 +#else /* PREEMPT_RT_FULL */
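+       /*
+        * On RT softirqs are always processed in thread context: wake the
+        * softirq threads that have pending work instead of running
+        * softirqs from the hardirq exit path.
+        */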
18536 +       unsigned long flags;
18537 +
18538 +       local_irq_save(flags);
18539 +       if (__this_cpu_read(ksoftirqd) &&
18540 +                       __this_cpu_read(ksoftirqd)->softirqs_raised)
18541 +               wakeup_softirqd();
18542 +       if (__this_cpu_read(ktimer_softirqd) &&
18543 +                       __this_cpu_read(ktimer_softirqd)->softirqs_raised)
18544 +               wakeup_timer_softirqd();
18545 +       local_irq_restore(flags);
18546 +#endif
18547  }
18548  
18549  static inline void tick_irq_exit(void)
18550 @@ -395,26 +852,6 @@ void irq_exit(void)
18551         trace_hardirq_exit(); /* must be last! */
18552  }
18553  
18554 -/*
18555 - * This function must run with irqs disabled!
18556 - */
18557 -inline void raise_softirq_irqoff(unsigned int nr)
18558 -{
18559 -       __raise_softirq_irqoff(nr);
18560 -
18561 -       /*
18562 -        * If we're in an interrupt or softirq, we're done
18563 -        * (this also catches softirq-disabled code). We will
18564 -        * actually run the softirq once we return from
18565 -        * the irq or softirq.
18566 -        *
18567 -        * Otherwise we wake up ksoftirqd to make sure we
18568 -        * schedule the softirq soon.
18569 -        */
18570 -       if (!in_interrupt())
18571 -               wakeup_softirqd();
18572 -}
18573 -
18574  void raise_softirq(unsigned int nr)
18575  {
18576         unsigned long flags;
18577 @@ -424,12 +861,6 @@ void raise_softirq(unsigned int nr)
18578         local_irq_restore(flags);
18579  }
18580  
18581 -void __raise_softirq_irqoff(unsigned int nr)
18582 -{
18583 -       trace_softirq_raise(nr);
18584 -       or_softirq_pending(1UL << nr);
18585 -}
18586 -
18587  void open_softirq(int nr, void (*action)(struct softirq_action *))
18588  {
18589         softirq_vec[nr].action = action;
18590 @@ -446,15 +877,45 @@ struct tasklet_head {
18591  static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
18592  static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
18593  
18594 +static inline void
18595 +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr)
18596 +{
18597 +       if (tasklet_trylock(t)) {
18598 +again:
18599 +               /* We may have been preempted before tasklet_trylock
18600 +                * and __tasklet_action may have already run.
18601 +                * So double check the sched bit while the tasklet
18602 +                * is locked before adding it to the list.
18603 +                */
18604 +               if (test_bit(TASKLET_STATE_SCHED, &t->state)) {
18605 +                       t->next = NULL;
18606 +                       *head->tail = t;
18607 +                       head->tail = &(t->next);
18608 +                       raise_softirq_irqoff(nr);
18609 +                       tasklet_unlock(t);
18610 +               } else {
18611 +                       /* This is subtle. If we hit the corner case above,
18612 +                        * it is possible that we get preempted right here,
18613 +                        * and another task has successfully called
18614 +                        * tasklet_schedule(), then this function, and
18615 +                        * failed on the trylock. Thus we must be sure,
18616 +                        * before releasing the tasklet lock, that the
18617 +                        * SCHED bit is clear. Otherwise the tasklet
18618 +                        * may get its SCHED bit set but never be added
18619 +                        * to the list.
18620 +                        */
18621 +                       if (!tasklet_tryunlock(t))
18622 +                               goto again;
18623 +               }
18624 +       }
18625 +}
18626 +
18627  void __tasklet_schedule(struct tasklet_struct *t)
18628  {
18629         unsigned long flags;
18630  
18631         local_irq_save(flags);
18632 -       t->next = NULL;
18633 -       *__this_cpu_read(tasklet_vec.tail) = t;
18634 -       __this_cpu_write(tasklet_vec.tail, &(t->next));
18635 -       raise_softirq_irqoff(TASKLET_SOFTIRQ);
18636 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
18637         local_irq_restore(flags);
18638  }
18639  EXPORT_SYMBOL(__tasklet_schedule);
18640 @@ -464,10 +925,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
18641         unsigned long flags;
18642  
18643         local_irq_save(flags);
18644 -       t->next = NULL;
18645 -       *__this_cpu_read(tasklet_hi_vec.tail) = t;
18646 -       __this_cpu_write(tasklet_hi_vec.tail,  &(t->next));
18647 -       raise_softirq_irqoff(HI_SOFTIRQ);
18648 +       __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
18649         local_irq_restore(flags);
18650  }
18651  EXPORT_SYMBOL(__tasklet_hi_schedule);
18652 @@ -476,82 +934,122 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)
18653  {
18654         BUG_ON(!irqs_disabled());
18655  
18656 -       t->next = __this_cpu_read(tasklet_hi_vec.head);
18657 -       __this_cpu_write(tasklet_hi_vec.head, t);
18658 -       __raise_softirq_irqoff(HI_SOFTIRQ);
18659 +       __tasklet_hi_schedule(t);
18660  }
18661  EXPORT_SYMBOL(__tasklet_hi_schedule_first);
18662  
18663 -static void tasklet_action(struct softirq_action *a)
18664 +void tasklet_enable(struct tasklet_struct *t)
18665  {
18666 -       struct tasklet_struct *list;
18667 +       if (!atomic_dec_and_test(&t->count))
18668 +               return;
18669 +       if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state))
18670 +               tasklet_schedule(t);
18671 +}
18672 +EXPORT_SYMBOL(tasklet_enable);
18673  
18674 -       local_irq_disable();
18675 -       list = __this_cpu_read(tasklet_vec.head);
18676 -       __this_cpu_write(tasklet_vec.head, NULL);
18677 -       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
18678 -       local_irq_enable();
18679 +static void __tasklet_action(struct softirq_action *a,
18680 +                            struct tasklet_struct *list)
18681 +{
18682 +       int loops = 1000000;
18683  
18684         while (list) {
18685                 struct tasklet_struct *t = list;
18686  
18687                 list = list->next;
18688  
18689 -               if (tasklet_trylock(t)) {
18690 -                       if (!atomic_read(&t->count)) {
18691 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
18692 -                                                       &t->state))
18693 -                                       BUG();
18694 -                               t->func(t->data);
18695 -                               tasklet_unlock(t);
18696 -                               continue;
18697 -                       }
18698 -                       tasklet_unlock(t);
18699 +               /*
18700 +                * Should always succeed - after a tasklet got on the
18701 +                * list (after getting the SCHED bit set from 0 to 1),
18702 +                * nothing but the tasklet softirq it got queued to can
18703 +                * lock it:
18704 +                */
18705 +               if (!tasklet_trylock(t)) {
18706 +                       WARN_ON(1);
18707 +                       continue;
18708                 }
18709  
18710 -               local_irq_disable();
18711                 t->next = NULL;
18712 -               *__this_cpu_read(tasklet_vec.tail) = t;
18713 -               __this_cpu_write(tasklet_vec.tail, &(t->next));
18714 -               __raise_softirq_irqoff(TASKLET_SOFTIRQ);
18715 -               local_irq_enable();
18716 +
18717 +               /*
18718 +                * If we cannot handle the tasklet because it's disabled,
18719 +                * mark it as pending. tasklet_enable() will later
18720 +                * re-schedule the tasklet.
18721 +                */
18722 +               if (unlikely(atomic_read(&t->count))) {
18723 +out_disabled:
18724 +                       /* implicit unlock: */
18725 +                       wmb();
18726 +                       t->state = TASKLET_STATEF_PENDING;
18727 +                       continue;
18728 +               }
18729 +
18730 +               /*
18731 +                * From this point on the tasklet might be rescheduled
18732 +                * on another CPU, but it can only be added to another
18733 +                * CPU's tasklet list if we unlock the tasklet (which we
18734 +                * don't do yet).
18735 +                */
18736 +               if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
18737 +                       WARN_ON(1);
18738 +
18739 +again:
18740 +               t->func(t->data);
18741 +
18742 +               /*
18743 +                * Try to unlock the tasklet. We must use cmpxchg, because
18744 +                * another CPU might have scheduled or disabled the tasklet.
18745 +                * We only allow the STATE_RUN -> 0 transition here.
18746 +                */
18747 +               while (!tasklet_tryunlock(t)) {
18748 +                       /*
18749 +                        * If it got disabled meanwhile, bail out:
18750 +                        */
18751 +                       if (atomic_read(&t->count))
18752 +                               goto out_disabled;
18753 +                       /*
18754 +                        * If it got scheduled meanwhile, re-execute
18755 +                        * the tasklet function:
18756 +                        */
18757 +                       if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
18758 +                               goto again;
18759 +                       if (!--loops) {
18760 +                               printk("hm, tasklet state: %08lx\n", t->state);
18761 +                               WARN_ON(1);
18762 +                               tasklet_unlock(t);
18763 +                               break;
18764 +                       }
18765 +               }
18766         }
18767  }
18768  
18769 +static void tasklet_action(struct softirq_action *a)
18770 +{
18771 +       struct tasklet_struct *list;
18772 +
18773 +       local_irq_disable();
18774 +
18775 +       list = __this_cpu_read(tasklet_vec.head);
18776 +       __this_cpu_write(tasklet_vec.head, NULL);
18777 +       __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
18778 +
18779 +       local_irq_enable();
18780 +
18781 +       __tasklet_action(a, list);
18782 +}
18783 +
18784  static void tasklet_hi_action(struct softirq_action *a)
18785  {
18786         struct tasklet_struct *list;
18787  
18788         local_irq_disable();
18789 +
18790         list = __this_cpu_read(tasklet_hi_vec.head);
18791         __this_cpu_write(tasklet_hi_vec.head, NULL);
18792         __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
18793 +
18794         local_irq_enable();
18795  
18796 -       while (list) {
18797 -               struct tasklet_struct *t = list;
18798 -
18799 -               list = list->next;
18800 -
18801 -               if (tasklet_trylock(t)) {
18802 -                       if (!atomic_read(&t->count)) {
18803 -                               if (!test_and_clear_bit(TASKLET_STATE_SCHED,
18804 -                                                       &t->state))
18805 -                                       BUG();
18806 -                               t->func(t->data);
18807 -                               tasklet_unlock(t);
18808 -                               continue;
18809 -                       }
18810 -                       tasklet_unlock(t);
18811 -               }
18812 -
18813 -               local_irq_disable();
18814 -               t->next = NULL;
18815 -               *__this_cpu_read(tasklet_hi_vec.tail) = t;
18816 -               __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
18817 -               __raise_softirq_irqoff(HI_SOFTIRQ);
18818 -               local_irq_enable();
18819 -       }
18820 +       __tasklet_action(a, list);
18821  }
18822  
18823  void tasklet_init(struct tasklet_struct *t,
18824 @@ -572,7 +1070,7 @@ void tasklet_kill(struct tasklet_struct *t)
18825  
18826         while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
18827                 do {
18828 -                       yield();
18829 +                       msleep(1);
18830                 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
18831         }
18832         tasklet_unlock_wait(t);
18833 @@ -646,25 +1144,26 @@ void __init softirq_init(void)
18834         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
18835  }
18836  
18837 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
18838 +void tasklet_unlock_wait(struct tasklet_struct *t)
18839 +{
18840 +       while (test_bit(TASKLET_STATE_RUN, &(t)->state)) {
18841 +               /*
18842 +                * Hack for now to avoid this busy-loop:
18843 +                */
18844 +#ifdef CONFIG_PREEMPT_RT_FULL
18845 +               msleep(1);
18846 +#else
18847 +               barrier();
18848 +#endif
18849 +       }
18850 +}
18851 +EXPORT_SYMBOL(tasklet_unlock_wait);
18852 +#endif
18853 +
18854  static int ksoftirqd_should_run(unsigned int cpu)
18855  {
18856 -       return local_softirq_pending();
18857 -}
18858 -
18859 -static void run_ksoftirqd(unsigned int cpu)
18860 -{
18861 -       local_irq_disable();
18862 -       if (local_softirq_pending()) {
18863 -               /*
18864 -                * We can safely run softirq on inline stack, as we are not deep
18865 -                * in the task stack here.
18866 -                */
18867 -               __do_softirq();
18868 -               local_irq_enable();
18869 -               cond_resched_rcu_qs();
18870 -               return;
18871 -       }
18872 -       local_irq_enable();
18873 +       return ksoftirqd_softirq_pending();
18874  }
18875  
18876  #ifdef CONFIG_HOTPLUG_CPU
18877 @@ -746,16 +1245,31 @@ static struct notifier_block cpu_nfb = {
18878  
18879  static struct smp_hotplug_thread softirq_threads = {
18880         .store                  = &ksoftirqd,
18881 +       .setup                  = ksoftirqd_set_sched_params,
18882         .thread_should_run      = ksoftirqd_should_run,
18883         .thread_fn              = run_ksoftirqd,
18884         .thread_comm            = "ksoftirqd/%u",
18885  };
18886  
18887 +#ifdef CONFIG_PREEMPT_RT_FULL
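+/*
+ * On RT the softirqs in the TIMER_SOFTIRQS mask are handled by a separate
+ * per-cpu thread, ktimersoftd, which ktimer_softirqd_set_sched_params()
+ * sets up at SCHED_FIFO priority 1.
+ */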
18888 +static struct smp_hotplug_thread softirq_timer_threads = {
18889 +       .store                  = &ktimer_softirqd,
18890 +       .setup                  = ktimer_softirqd_set_sched_params,
18891 +       .cleanup                = ktimer_softirqd_clr_sched_params,
18892 +       .thread_should_run      = ktimer_softirqd_should_run,
18893 +       .thread_fn              = run_ksoftirqd,
18894 +       .thread_comm            = "ktimersoftd/%u",
18895 +};
18896 +#endif
18897 +
18898  static __init int spawn_ksoftirqd(void)
18899  {
18900         register_cpu_notifier(&cpu_nfb);
18901  
18902         BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
18903 +#ifdef CONFIG_PREEMPT_RT_FULL
18904 +       BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads));
18905 +#endif
18906  
18907         return 0;
18908  }
18909 diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
18910 index 4a1ca5f6da7e..3226e22b9e42 100644
18911 --- a/kernel/stop_machine.c
18912 +++ b/kernel/stop_machine.c
18913 @@ -37,7 +37,7 @@ struct cpu_stop_done {
18914  struct cpu_stopper {
18915         struct task_struct      *thread;
18916  
18917 -       spinlock_t              lock;
18918 +       raw_spinlock_t          lock;
18919         bool                    enabled;        /* is this stopper enabled? */
18920         struct list_head        works;          /* list of pending works */
18921  
18922 @@ -83,14 +83,14 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
18923         unsigned long flags;
18924         bool enabled;
18925  
18926 -       spin_lock_irqsave(&stopper->lock, flags);
18927 +       raw_spin_lock_irqsave(&stopper->lock, flags);
18928         enabled = stopper->enabled;
18929         if (enabled)
18930                 __cpu_stop_queue_work(stopper, work);
18931         else if (work->done)
18932                 cpu_stop_signal_done(work->done);
18933 -       spin_unlock_irqrestore(&stopper->lock, flags);
18934  
18935 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
18936         return enabled;
18937  }
18938  
18939 @@ -232,8 +232,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
18940         int err;
18941  
18942         lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
18943 -       spin_lock_irq(&stopper1->lock);
18944 -       spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
18945 +       raw_spin_lock_irq(&stopper1->lock);
18946 +       raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
18947  
18948         err = -ENOENT;
18949         if (!stopper1->enabled || !stopper2->enabled)
18950 @@ -243,8 +243,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
18951         __cpu_stop_queue_work(stopper1, work1);
18952         __cpu_stop_queue_work(stopper2, work2);
18953  unlock:
18954 -       spin_unlock(&stopper2->lock);
18955 -       spin_unlock_irq(&stopper1->lock);
18956 +       raw_spin_unlock(&stopper2->lock);
18957 +       raw_spin_unlock_irq(&stopper1->lock);
18958         lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
18959  
18960         return err;
18961 @@ -321,18 +321,21 @@ static DEFINE_MUTEX(stop_cpus_mutex);
18962  
18963  static bool queue_stop_cpus_work(const struct cpumask *cpumask,
18964                                  cpu_stop_fn_t fn, void *arg,
18965 -                                struct cpu_stop_done *done)
18966 +                                struct cpu_stop_done *done, bool inactive)
18967  {
18968         struct cpu_stop_work *work;
18969         unsigned int cpu;
18970         bool queued = false;
18971  
18972         /*
18973 -        * Disable preemption while queueing to avoid getting
18974 -        * preempted by a stopper which might wait for other stoppers
18975 -        * to enter @fn which can lead to deadlock.
18976 +        * Make sure that all work is queued on all cpus before
18977 +        * any of the cpus can execute it.
18978          */
18979 -       lg_global_lock(&stop_cpus_lock);
18980 +       if (!inactive)
18981 +               lg_global_lock(&stop_cpus_lock);
18982 +       else
18983 +               lg_global_trylock_relax(&stop_cpus_lock);
18984 +
18985         for_each_cpu(cpu, cpumask) {
18986                 work = &per_cpu(cpu_stopper.stop_work, cpu);
18987                 work->fn = fn;
18988 @@ -352,7 +355,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
18989         struct cpu_stop_done done;
18990  
18991         cpu_stop_init_done(&done, cpumask_weight(cpumask));
18992 -       if (!queue_stop_cpus_work(cpumask, fn, arg, &done))
18993 +       if (!queue_stop_cpus_work(cpumask, fn, arg, &done, false))
18994                 return -ENOENT;
18995         wait_for_completion(&done.completion);
18996         return done.ret;
18997 @@ -433,9 +436,9 @@ static int cpu_stop_should_run(unsigned int cpu)
18998         unsigned long flags;
18999         int run;
19000  
19001 -       spin_lock_irqsave(&stopper->lock, flags);
19002 +       raw_spin_lock_irqsave(&stopper->lock, flags);
19003         run = !list_empty(&stopper->works);
19004 -       spin_unlock_irqrestore(&stopper->lock, flags);
19005 +       raw_spin_unlock_irqrestore(&stopper->lock, flags);
19006         return run;
19007  }
19008  
19009 @@ -446,13 +449,13 @@ static void cpu_stopper_thread(unsigned int cpu)
19010  
19011  repeat:
19012         work = NULL;
19013 -       spin_lock_irq(&stopper->lock);
19014 +       raw_spin_lock_irq(&stopper->lock);
19015         if (!list_empty(&stopper->works)) {
19016                 work = list_first_entry(&stopper->works,
19017                                         struct cpu_stop_work, list);
19018                 list_del_init(&work->list);
19019         }
19020 -       spin_unlock_irq(&stopper->lock);
19021 +       raw_spin_unlock_irq(&stopper->lock);
19022  
19023         if (work) {
19024                 cpu_stop_fn_t fn = work->fn;
19025 @@ -460,6 +463,16 @@ static void cpu_stopper_thread(unsigned int cpu)
19026                 struct cpu_stop_done *done = work->done;
19027                 int ret;
19028  
19029 +               /*
19030 +                * Wait until the stopper has finished scheduling on
19031 +                * all cpus
19032 +                */
19033 +               lg_global_lock(&stop_cpus_lock);
19034 +               /*
19035 +                * Let other cpu threads continue as well
19036 +                */
19037 +               lg_global_unlock(&stop_cpus_lock);
19038 +
19039                 /* cpu stop callbacks must not sleep, make in_atomic() == T */
19040                 preempt_count_inc();
19041                 ret = fn(arg);
19042 @@ -526,10 +539,12 @@ static int __init cpu_stop_init(void)
19043         for_each_possible_cpu(cpu) {
19044                 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
19045  
19046 -               spin_lock_init(&stopper->lock);
19047 +               raw_spin_lock_init(&stopper->lock);
19048                 INIT_LIST_HEAD(&stopper->works);
19049         }
19050  
19051 +       lg_lock_init(&stop_cpus_lock, "stop_cpus_lock");
19052 +
19053         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
19054         stop_machine_unpark(raw_smp_processor_id());
19055         stop_machine_initialized = true;
19056 @@ -624,7 +639,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
19057         set_state(&msdata, MULTI_STOP_PREPARE);
19058         cpu_stop_init_done(&done, num_active_cpus());
19059         queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
19060 -                            &done);
19061 +                            &done, true);
19062         ret = multi_cpu_stop(&msdata);
19063  
19064         /* Busy wait for completion. */
19065 diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
19066 index 9ba7c820fc23..d85f638fd99e 100644
19067 --- a/kernel/time/hrtimer.c
19068 +++ b/kernel/time/hrtimer.c
19069 @@ -53,6 +53,7 @@
19070  #include <asm/uaccess.h>
19071  
19072  #include <trace/events/timer.h>
19073 +#include <trace/events/hist.h>
19074  
19075  #include "tick-internal.h"
19076  
19077 @@ -695,6 +696,29 @@ static void hrtimer_switch_to_hres(void)
19078         retrigger_next_event(NULL);
19079  }
19080  
19081 +#ifdef CONFIG_PREEMPT_RT_FULL
19082 +
19083 +static struct swork_event clock_set_delay_work;
19084 +
19085 +static void run_clock_set_delay(struct swork_event *event)
19086 +{
19087 +       clock_was_set();
19088 +}
19089 +
19090 +void clock_was_set_delayed(void)
19091 +{
19092 +       swork_queue(&clock_set_delay_work);
19093 +}
19094 +
19095 +static __init int create_clock_set_delay_thread(void)
19096 +{
19097 +       WARN_ON(swork_get());
19098 +       INIT_SWORK(&clock_set_delay_work, run_clock_set_delay);
19099 +       return 0;
19100 +}
19101 +early_initcall(create_clock_set_delay_thread);
19102 +#else /* PREEMPT_RT_FULL */
19103 +
19104  static void clock_was_set_work(struct work_struct *work)
19105  {
19106         clock_was_set();
19107 @@ -710,6 +734,7 @@ void clock_was_set_delayed(void)
19108  {
19109         schedule_work(&hrtimer_work);
19110  }
19111 +#endif
19112  
19113  #else
19114  
19115 @@ -719,11 +744,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
19116  static inline void hrtimer_switch_to_hres(void) { }
19117  static inline void
19118  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
19119 -static inline int hrtimer_reprogram(struct hrtimer *timer,
19120 -                                   struct hrtimer_clock_base *base)
19121 -{
19122 -       return 0;
19123 -}
19124 +static inline void hrtimer_reprogram(struct hrtimer *timer,
19125 +                                    struct hrtimer_clock_base *base) { }
19126  static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
19127  static inline void retrigger_next_event(void *arg) { }
19128  
19129 @@ -855,6 +877,32 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
19130  }
19131  EXPORT_SYMBOL_GPL(hrtimer_forward);
19132  
19133 +#ifdef CONFIG_PREEMPT_RT_BASE
19134 +# define wake_up_timer_waiters(b)      wake_up(&(b)->wait)
19135 +
19136 +/**
19137 + * hrtimer_wait_for_timer - Wait for a running timer
19138 + *
19139 + * @timer:     timer to wait for
19140 + *
19141 + * The function waits on the waitqueue of the timer base in case
19142 + * the timer's callback function is currently executing. The
19143 + * waitqueue is woken up after the timer callback function has
19144 + * finished execution.
19145 + */
19146 +void hrtimer_wait_for_timer(const struct hrtimer *timer)
19147 +{
19148 +       struct hrtimer_clock_base *base = timer->base;
19149 +
19150 +       if (base && base->cpu_base && !timer->irqsafe)
19151 +               wait_event(base->cpu_base->wait,
19152 +                               !(hrtimer_callback_running(timer)));
19153 +}
19154 +
19155 +#else
19156 +# define wake_up_timer_waiters(b)      do { } while (0)
19157 +#endif
19158 +
19159  /*
19160   * enqueue_hrtimer - internal function to (re)start a timer
19161   *
19162 @@ -896,6 +944,11 @@ static void __remove_hrtimer(struct hrtimer *timer,
19163         if (!(state & HRTIMER_STATE_ENQUEUED))
19164                 return;
19165  
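+       /*
+        * An RT-deferred timer sits on the clock base's expired list via
+        * cb_entry; take it off that list instead of the timerqueue.
+        */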
19166 +       if (unlikely(!list_empty(&timer->cb_entry))) {
19167 +               list_del_init(&timer->cb_entry);
19168 +               return;
19169 +       }
19170 +
19171         if (!timerqueue_del(&base->active, &timer->node))
19172                 cpu_base->active_bases &= ~(1 << base->index);
19173  
19174 @@ -991,7 +1044,16 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
19175         new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
19176  
19177         timer_stats_hrtimer_set_start_info(timer);
19178 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
19179 +       {
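+               /*
+                * Remember the enqueue time when the timer is programmed to
+                * a time already in the past, so the missed timer offsets
+                * histogram can account for the delay from enqueue.
+                */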
19180 +               ktime_t now = new_base->get_time();
19181  
19182 +               if (ktime_to_ns(tim) < ktime_to_ns(now))
19183 +                       timer->praecox = now;
19184 +               else
19185 +                       timer->praecox = ktime_set(0, 0);
19186 +       }
19187 +#endif
19188         leftmost = enqueue_hrtimer(timer, new_base);
19189         if (!leftmost)
19190                 goto unlock;
19191 @@ -1063,7 +1125,7 @@ int hrtimer_cancel(struct hrtimer *timer)
19192  
19193                 if (ret >= 0)
19194                         return ret;
19195 -               cpu_relax();
19196 +               hrtimer_wait_for_timer(timer);
19197         }
19198  }
19199  EXPORT_SYMBOL_GPL(hrtimer_cancel);
19200 @@ -1127,6 +1189,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
19201  
19202         base = hrtimer_clockid_to_base(clock_id);
19203         timer->base = &cpu_base->clock_base[base];
19204 +       INIT_LIST_HEAD(&timer->cb_entry);
19205         timerqueue_init(&timer->node);
19206  
19207  #ifdef CONFIG_TIMER_STATS
19208 @@ -1167,6 +1230,7 @@ bool hrtimer_active(const struct hrtimer *timer)
19209                 seq = raw_read_seqcount_begin(&cpu_base->seq);
19210  
19211                 if (timer->state != HRTIMER_STATE_INACTIVE ||
19212 +                   cpu_base->running_soft == timer ||
19213                     cpu_base->running == timer)
19214                         return true;
19215  
19216 @@ -1265,10 +1329,112 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
19217         cpu_base->running = NULL;
19218  }
19219  
19220 +#ifdef CONFIG_PREEMPT_RT_BASE
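+/*
+ * Requeue a deferred timer whose callback asked for a restart and which
+ * has not been re-enqueued by somebody else in the meantime.
+ */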
19221 +static void hrtimer_rt_reprogram(int restart, struct hrtimer *timer,
19222 +                                struct hrtimer_clock_base *base)
19223 +{
19224 +       int leftmost;
19225 +
19226 +       if (restart != HRTIMER_NORESTART &&
19227 +           !(timer->state & HRTIMER_STATE_ENQUEUED)) {
19228 +
19229 +               leftmost = enqueue_hrtimer(timer, base);
19230 +               if (!leftmost)
19231 +                       return;
19232 +#ifdef CONFIG_HIGH_RES_TIMERS
19233 +               if (!hrtimer_is_hres_active(timer)) {
19234 +                       /*
19235 +                        * Kick to reschedule the next tick to handle the new timer
19236 +                        * on dynticks target.
19237 +                        */
19238 +                       if (base->cpu_base->nohz_active)
19239 +                               wake_up_nohz_cpu(base->cpu_base->cpu);
19240 +               } else {
19241 +
19242 +                       hrtimer_reprogram(timer, base);
19243 +               }
19244 +#endif
19245 +       }
19246 +}
19247 +
19248 +/*
19249 + * The changes in mainline which removed the callback modes from
19250 + * hrtimer are not yet working with -rt. The non wakeup_process()
19251 + * based callbacks which involve sleeping locks need to be treated
19252 + * separately.
19253 + */
19254 +static void hrtimer_rt_run_pending(void)
19255 +{
19256 +       enum hrtimer_restart (*fn)(struct hrtimer *);
19257 +       struct hrtimer_cpu_base *cpu_base;
19258 +       struct hrtimer_clock_base *base;
19259 +       struct hrtimer *timer;
19260 +       int index, restart;
19261 +
19262 +       local_irq_disable();
19263 +       cpu_base = &per_cpu(hrtimer_bases, smp_processor_id());
19264 +
19265 +       raw_spin_lock(&cpu_base->lock);
19266 +
19267 +       for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
19268 +               base = &cpu_base->clock_base[index];
19269 +
19270 +               while (!list_empty(&base->expired)) {
19271 +                       timer = list_first_entry(&base->expired,
19272 +                                                struct hrtimer, cb_entry);
19273 +
19274 +                       /*
19275 +                        * Same as the above __run_hrtimer function,
19276 +                        * except that we run with interrupts enabled.
19277 +                        */
19278 +                       debug_deactivate(timer);
19279 +                       cpu_base->running_soft = timer;
19280 +                       raw_write_seqcount_barrier(&cpu_base->seq);
19281 +
19282 +                       __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
19283 +                       timer_stats_account_hrtimer(timer);
19284 +                       fn = timer->function;
19285 +
19286 +                       raw_spin_unlock_irq(&cpu_base->lock);
19287 +                       restart = fn(timer);
19288 +                       raw_spin_lock_irq(&cpu_base->lock);
19289 +
19290 +                       hrtimer_rt_reprogram(restart, timer, base);
19291 +                       raw_write_seqcount_barrier(&cpu_base->seq);
19292 +
19293 +                       WARN_ON_ONCE(cpu_base->running_soft != timer);
19294 +                       cpu_base->running_soft = NULL;
19295 +               }
19296 +       }
19297 +
19298 +       raw_spin_unlock_irq(&cpu_base->lock);
19299 +
19300 +       wake_up_timer_waiters(cpu_base);
19301 +}
19302 +
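+/*
+ * Defer a non-irqsafe timer: move it from the timerqueue onto the clock
+ * base's expired list, to be processed later from the hrtimer softirq in
+ * task context.
+ */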
19303 +static int hrtimer_rt_defer(struct hrtimer *timer)
19304 +{
19305 +       if (timer->irqsafe)
19306 +               return 0;
19307 +
19308 +       __remove_hrtimer(timer, timer->base, timer->state, 0);
19309 +       list_add_tail(&timer->cb_entry, &timer->base->expired);
19310 +       return 1;
19311 +}
19312 +
19313 +#else
19314 +
19315 +static inline int hrtimer_rt_defer(struct hrtimer *timer) { return 0; }
19316 +
19317 +#endif
19318 +
19319 +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer);
19320 +
19321  static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19322  {
19323         struct hrtimer_clock_base *base = cpu_base->clock_base;
19324         unsigned int active = cpu_base->active_bases;
19325 +       int raise = 0;
19326  
19327         for (; active; base++, active >>= 1) {
19328                 struct timerqueue_node *node;
19329 @@ -1284,6 +1450,15 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19330  
19331                         timer = container_of(node, struct hrtimer, node);
19332  
19333 +                       trace_hrtimer_interrupt(raw_smp_processor_id(),
19334 +                           ktime_to_ns(ktime_sub(ktime_to_ns(timer->praecox) ?
19335 +                               timer->praecox : hrtimer_get_expires(timer),
19336 +                               basenow)),
19337 +                           current,
19338 +                           timer->function == hrtimer_wakeup ?
19339 +                           container_of(timer, struct hrtimer_sleeper,
19340 +                               timer)->task : NULL);
19341 +
19342                         /*
19343                          * The immediate goal for using the softexpires is
19344                          * minimizing wakeups, not running timers at the
19345 @@ -1299,9 +1474,14 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
19346                         if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
19347                                 break;
19348  
19349 -                       __run_hrtimer(cpu_base, base, timer, &basenow);
19350 +                       if (!hrtimer_rt_defer(timer))
19351 +                               __run_hrtimer(cpu_base, base, timer, &basenow);
19352 +                       else
19353 +                               raise = 1;
19354                 }
19355         }
19356 +       if (raise)
19357 +               raise_softirq_irqoff(HRTIMER_SOFTIRQ);
19358  }
19359  
19360  #ifdef CONFIG_HIGH_RES_TIMERS
19361 @@ -1464,16 +1644,18 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
19362  void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
19363  {
19364         sl->timer.function = hrtimer_wakeup;
19365 +       sl->timer.irqsafe = 1;
19366         sl->task = task;
19367  }
19368  EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
19369  
19370 -static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
19371 +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode,
19372 +                               unsigned long state)
19373  {
19374         hrtimer_init_sleeper(t, current);
19375  
19376         do {
19377 -               set_current_state(TASK_INTERRUPTIBLE);
19378 +               set_current_state(state);
19379                 hrtimer_start_expires(&t->timer, mode);
19380  
19381                 if (likely(t->task))
19382 @@ -1515,7 +1697,8 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
19383                                 HRTIMER_MODE_ABS);
19384         hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
19385  
19386 -       if (do_nanosleep(&t, HRTIMER_MODE_ABS))
19387 +       /* cpu_chill() does not care about restart state. */
19388 +       if (do_nanosleep(&t, HRTIMER_MODE_ABS, TASK_INTERRUPTIBLE))
19389                 goto out;
19390  
19391         rmtp = restart->nanosleep.rmtp;
19392 @@ -1532,8 +1715,10 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
19393         return ret;
19394  }
19395  
19396 -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19397 -                      const enum hrtimer_mode mode, const clockid_t clockid)
19398 +static long
19399 +__hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19400 +                   const enum hrtimer_mode mode, const clockid_t clockid,
19401 +                   unsigned long state)
19402  {
19403         struct restart_block *restart;
19404         struct hrtimer_sleeper t;
19405 @@ -1546,7 +1731,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19406  
19407         hrtimer_init_on_stack(&t.timer, clockid, mode);
19408         hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
19409 -       if (do_nanosleep(&t, mode))
19410 +       if (do_nanosleep(&t, mode, state))
19411                 goto out;
19412  
19413         /* Absolute timers do not update the rmtp value and restart: */
19414 @@ -1573,6 +1758,12 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19415         return ret;
19416  }
19417  
19418 +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
19419 +                      const enum hrtimer_mode mode, const clockid_t clockid)
19420 +{
19421 +       return __hrtimer_nanosleep(rqtp, rmtp, mode, clockid, TASK_INTERRUPTIBLE);
19422 +}
19423 +
19424  SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
19425                 struct timespec __user *, rmtp)
19426  {
19427 @@ -1587,6 +1778,26 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
19428         return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
19429  }
19430  
19431 +#ifdef CONFIG_PREEMPT_RT_FULL
19432 +/*
19433 + * Sleep for 1 ms in the hope that whoever holds what we want will let it go.
19434 + */
19435 +void cpu_chill(void)
19436 +{
19437 +       struct timespec tu = {
19438 +               .tv_nsec = NSEC_PER_MSEC,
19439 +       };
19440 +       unsigned int freeze_flag = current->flags & PF_NOFREEZE;
19441 +
19442 +       current->flags |= PF_NOFREEZE;
19443 +       __hrtimer_nanosleep(&tu, NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC,
19444 +                           TASK_UNINTERRUPTIBLE);
19445 +       if (!freeze_flag)
19446 +               current->flags &= ~PF_NOFREEZE;
19447 +}
19448 +EXPORT_SYMBOL(cpu_chill);
19449 +#endif
19450 +
19451  /*
19452   * Functions related to boot-time initialization:
19453   */
19454 @@ -1598,10 +1809,14 @@ int hrtimers_prepare_cpu(unsigned int cpu)
19455         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
19456                 cpu_base->clock_base[i].cpu_base = cpu_base;
19457                 timerqueue_init_head(&cpu_base->clock_base[i].active);
19458 +               INIT_LIST_HEAD(&cpu_base->clock_base[i].expired);
19459         }
19460  
19461         cpu_base->cpu = cpu;
19462         hrtimer_init_hres(cpu_base);
19463 +#ifdef CONFIG_PREEMPT_RT_BASE
19464 +       init_waitqueue_head(&cpu_base->wait);
19465 +#endif
19466         return 0;
19467  }
19468  
19469 @@ -1671,9 +1886,26 @@ int hrtimers_dead_cpu(unsigned int scpu)
19470  
19471  #endif /* CONFIG_HOTPLUG_CPU */
19472  
19473 +#ifdef CONFIG_PREEMPT_RT_BASE
19474 +
19475 +static void run_hrtimer_softirq(struct softirq_action *h)
19476 +{
19477 +       hrtimer_rt_run_pending();
19478 +}
19479 +
19480 +static void hrtimers_open_softirq(void)
19481 +{
19482 +       open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
19483 +}
19484 +
19485 +#else
19486 +static void hrtimers_open_softirq(void) { }
19487 +#endif
19488 +
19489  void __init hrtimers_init(void)
19490  {
19491         hrtimers_prepare_cpu(smp_processor_id());
19492 +       hrtimers_open_softirq();
19493  }
19494  
19495  /**
19496 diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
19497 index 1d5c7204ddc9..184de6751180 100644
19498 --- a/kernel/time/itimer.c
19499 +++ b/kernel/time/itimer.c
19500 @@ -213,6 +213,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
19501                 /* We are sharing ->siglock with it_real_fn() */
19502                 if (hrtimer_try_to_cancel(timer) < 0) {
19503                         spin_unlock_irq(&tsk->sighand->siglock);
19504 +                       hrtimer_wait_for_timer(&tsk->signal->real_timer);
19505                         goto again;
19506                 }
19507                 expires = timeval_to_ktime(value->it_value);
19508 diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
19509 index 555e21f7b966..a5d6435fabbb 100644
19510 --- a/kernel/time/jiffies.c
19511 +++ b/kernel/time/jiffies.c
19512 @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = {
19513         .max_cycles     = 10,
19514  };
19515  
19516 -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
19517 +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
19518 +__cacheline_aligned_in_smp seqcount_t jiffies_seq;
19519  
19520  #if (BITS_PER_LONG < 64)
19521  u64 get_jiffies_64(void)
19522 @@ -83,9 +84,9 @@ u64 get_jiffies_64(void)
19523         u64 ret;
19524  
19525         do {
19526 -               seq = read_seqbegin(&jiffies_lock);
19527 +               seq = read_seqcount_begin(&jiffies_seq);
19528                 ret = jiffies_64;
19529 -       } while (read_seqretry(&jiffies_lock, seq));
19530 +       } while (read_seqcount_retry(&jiffies_seq, seq));
19531         return ret;
19532  }
19533  EXPORT_SYMBOL(get_jiffies_64);
19534 diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
19535 index 6df8927c58a5..05b7391bf9bd 100644
19536 --- a/kernel/time/ntp.c
19537 +++ b/kernel/time/ntp.c
19538 @@ -17,6 +17,7 @@
19539  #include <linux/module.h>
19540  #include <linux/rtc.h>
19541  #include <linux/math64.h>
19542 +#include <linux/swork.h>
19543  
19544  #include "ntp_internal.h"
19545  #include "timekeeping_internal.h"
19546 @@ -568,10 +569,35 @@ static void sync_cmos_clock(struct work_struct *work)
19547                            &sync_cmos_work, timespec64_to_jiffies(&next));
19548  }
19549  
19550 +#ifdef CONFIG_PREEMPT_RT_FULL
19551 +
19552 +static void run_clock_set_delay(struct swork_event *event)
19553 +{
19554 +       queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
19555 +}
19556 +
19557 +static struct swork_event ntp_cmos_swork;
19558 +
19559 +void ntp_notify_cmos_timer(void)
19560 +{
19561 +       swork_queue(&ntp_cmos_swork);
19562 +}
19563 +
19564 +static __init int create_cmos_delay_thread(void)
19565 +{
19566 +       WARN_ON(swork_get());
19567 +       INIT_SWORK(&ntp_cmos_swork, run_clock_set_delay);
19568 +       return 0;
19569 +}
19570 +early_initcall(create_cmos_delay_thread);
19571 +
19572 +#else
19573 +
19574  void ntp_notify_cmos_timer(void)
19575  {
19576         queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
19577  }
19578 +#endif /* CONFIG_PREEMPT_RT_FULL */
19579  
19580  #else
19581  void ntp_notify_cmos_timer(void) { }
19582 diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
19583 index 39008d78927a..633f4eaca9e7 100644
19584 --- a/kernel/time/posix-cpu-timers.c
19585 +++ b/kernel/time/posix-cpu-timers.c
19586 @@ -3,6 +3,7 @@
19587   */
19588  
19589  #include <linux/sched.h>
19590 +#include <linux/sched/rt.h>
19591  #include <linux/posix-timers.h>
19592  #include <linux/errno.h>
19593  #include <linux/math64.h>
19594 @@ -620,7 +621,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
19595         /*
19596          * Disarm any old timer after extracting its expiry time.
19597          */
19598 -       WARN_ON_ONCE(!irqs_disabled());
19599 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19600  
19601         ret = 0;
19602         old_incr = timer->it.cpu.incr;
19603 @@ -1064,7 +1065,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
19604         /*
19605          * Now re-arm for the new expiry time.
19606          */
19607 -       WARN_ON_ONCE(!irqs_disabled());
19608 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19609         arm_timer(timer);
19610         unlock_task_sighand(p, &flags);
19611  
19612 @@ -1153,13 +1154,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
19613   * already updated our counts.  We need to check if any timers fire now.
19614   * Interrupts are disabled.
19615   */
19616 -void run_posix_cpu_timers(struct task_struct *tsk)
19617 +static void __run_posix_cpu_timers(struct task_struct *tsk)
19618  {
19619         LIST_HEAD(firing);
19620         struct k_itimer *timer, *next;
19621         unsigned long flags;
19622  
19623 -       WARN_ON_ONCE(!irqs_disabled());
19624 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
19625  
19626         /*
19627          * The fast path checks that there are no expired thread or thread
19628 @@ -1213,6 +1214,190 @@ void run_posix_cpu_timers(struct task_struct *tsk)
19629         }
19630  }
19631  
19632 +#ifdef CONFIG_PREEMPT_RT_BASE
19633 +#include <linux/kthread.h>
19634 +#include <linux/cpu.h>
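+/*
+ * posix_timer_task: per-cpu kthread which runs expired posix CPU timers
+ * in task context.
+ * posix_timer_tasklist: per-cpu list of tasks with expired timers, linked
+ * through tsk->posix_timer_list and terminated by a task_struct pointing
+ * to itself.
+ */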
19635 +DEFINE_PER_CPU(struct task_struct *, posix_timer_task);
19636 +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist);
19637 +
19638 +static int posix_cpu_timers_thread(void *data)
19639 +{
19640 +       int cpu = (long)data;
19641 +
19642 +       BUG_ON(per_cpu(posix_timer_task,cpu) != current);
19643 +
19644 +       while (!kthread_should_stop()) {
19645 +               struct task_struct *tsk = NULL;
19646 +               struct task_struct *next = NULL;
19647 +
19648 +               if (cpu_is_offline(cpu))
19649 +                       goto wait_to_die;
19650 +
19651 +               /* grab task list */
19652 +               raw_local_irq_disable();
19653 +               tsk = per_cpu(posix_timer_tasklist, cpu);
19654 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
19655 +               raw_local_irq_enable();
19656 +
19657 +               /* it's possible the list is empty, just return */
19658 +               if (!tsk) {
19659 +                       set_current_state(TASK_INTERRUPTIBLE);
19660 +                       schedule();
19661 +                       __set_current_state(TASK_RUNNING);
19662 +                       continue;
19663 +               }
19664 +
19665 +               /* Process task list */
19666 +               while (1) {
19667 +                       /* save next */
19668 +                       next = tsk->posix_timer_list;
19669 +
19670 +                       /* run the task's timers, clear its list pointer
19671 +                        * and drop our reference to it
19672 +                        */
19673 +                       __run_posix_cpu_timers(tsk);
19674 +                       tsk->posix_timer_list = NULL;
19675 +                       put_task_struct(tsk);
19676 +
19677 +                       /* check if this is the last on the list */
19678 +                       if (next == tsk)
19679 +                               break;
19680 +                       tsk = next;
19681 +               }
19682 +       }
19683 +       return 0;
19684 +
19685 +wait_to_die:
19686 +       /* Wait for kthread_stop */
19687 +       set_current_state(TASK_INTERRUPTIBLE);
19688 +       while (!kthread_should_stop()) {
19689 +               schedule();
19690 +               set_current_state(TASK_INTERRUPTIBLE);
19691 +       }
19692 +       __set_current_state(TASK_RUNNING);
19693 +       return 0;
19694 +}
19695 +
19696 +static inline int __fastpath_timer_check(struct task_struct *tsk)
19697 +{
19698 +       /* tsk == current, ensure it is safe to use ->signal/sighand */
19699 +       if (unlikely(tsk->exit_state))
19700 +               return 0;
19701 +
19702 +       if (!task_cputime_zero(&tsk->cputime_expires))
19703 +               return 1;
19704 +
19705 +       if (!task_cputime_zero(&tsk->signal->cputime_expires))
19706 +               return 1;
19707 +
19708 +       return 0;
19709 +}
19710 +
19711 +void run_posix_cpu_timers(struct task_struct *tsk)
19712 +{
19713 +       unsigned long cpu = smp_processor_id();
19714 +       struct task_struct *tasklist;
19715 +
19716 +       BUG_ON(!irqs_disabled());
19717 +       if (!per_cpu(posix_timer_task, cpu))
19718 +               return;
19719 +       /* get per-cpu references */
19720 +       tasklist = per_cpu(posix_timer_tasklist, cpu);
19721 +
19722 +       /* check to see if we're already queued */
19723 +       if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) {
19724 +               get_task_struct(tsk);
19725 +               if (tasklist) {
19726 +                       tsk->posix_timer_list = tasklist;
19727 +               } else {
19728 +                       /*
19729 +                        * The list is terminated by a self-pointing
19730 +                        * task_struct
19731 +                        */
19732 +                       tsk->posix_timer_list = tsk;
19733 +               }
19734 +               per_cpu(posix_timer_tasklist, cpu) = tsk;
19735 +
19736 +               wake_up_process(per_cpu(posix_timer_task, cpu));
19737 +       }
19738 +}
19739 +
19740 +/*
19741 + * posix_cpu_thread_call - callback that gets triggered when a CPU is added.
19742 + * Here we can start up the necessary posix timer thread for the new CPU.
19743 + */
19744 +static int posix_cpu_thread_call(struct notifier_block *nfb,
19745 +                                unsigned long action, void *hcpu)
19746 +{
19747 +       int cpu = (long)hcpu;
19748 +       struct task_struct *p;
19749 +       struct sched_param param;
19750 +
19751 +       switch (action) {
19752 +       case CPU_UP_PREPARE:
19753 +               p = kthread_create(posix_cpu_timers_thread, hcpu,
19754 +                                       "posixcputmr/%d",cpu);
19755 +               if (IS_ERR(p))
19756 +                       return NOTIFY_BAD;
19757 +               p->flags |= PF_NOFREEZE;
19758 +               kthread_bind(p, cpu);
19759 +               /* Must be high prio to avoid getting starved */
19760 +               param.sched_priority = MAX_RT_PRIO-1;
19761 +               sched_setscheduler(p, SCHED_FIFO, &param);
19762 +               per_cpu(posix_timer_task,cpu) = p;
19763 +               break;
19764 +       case CPU_ONLINE:
19765 +               /* Strictly unnecessary, as first user will wake it. */
19766 +               wake_up_process(per_cpu(posix_timer_task,cpu));
19767 +               break;
19768 +#ifdef CONFIG_HOTPLUG_CPU
19769 +       case CPU_UP_CANCELED:
19770 +               /* Unbind it from offline cpu so it can run.  Fall thru. */
19771 +               kthread_bind(per_cpu(posix_timer_task, cpu),
19772 +                            cpumask_any(cpu_online_mask));
19773 +               kthread_stop(per_cpu(posix_timer_task,cpu));
19774 +               per_cpu(posix_timer_task,cpu) = NULL;
19775 +               break;
19776 +       case CPU_DEAD:
19777 +               kthread_stop(per_cpu(posix_timer_task,cpu));
19778 +               per_cpu(posix_timer_task,cpu) = NULL;
19779 +               break;
19780 +#endif
19781 +       }
19782 +       return NOTIFY_OK;
19783 +}
19784 +
19785 +/* Register at highest priority so that task migration (migrate_all_tasks)
19786 + * happens before everything else.
19787 + */
19788 +static struct notifier_block posix_cpu_thread_notifier = {
19789 +       .notifier_call = posix_cpu_thread_call,
19790 +       .priority = 10
19791 +};
19792 +
19793 +static int __init posix_cpu_thread_init(void)
19794 +{
19795 +       void *hcpu = (void *)(long)smp_processor_id();
19796 +       /* Start one for boot CPU. */
19797 +       unsigned long cpu;
19798 +
19799 +       /* init the per-cpu posix_timer_tasklets */
19800 +       for_each_possible_cpu(cpu)
19801 +               per_cpu(posix_timer_tasklist, cpu) = NULL;
19802 +
19803 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, hcpu);
19804 +       posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, hcpu);
19805 +       register_cpu_notifier(&posix_cpu_thread_notifier);
19806 +       return 0;
19807 +}
19808 +early_initcall(posix_cpu_thread_init);
19809 +#else /* CONFIG_PREEMPT_RT_BASE */
19810 +void run_posix_cpu_timers(struct task_struct *tsk)
19811 +{
19812 +       __run_posix_cpu_timers(tsk);
19813 +}
19814 +#endif /* CONFIG_PREEMPT_RT_BASE */
19815 +
19816  /*
19817   * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
19818   * The tsk->sighand->siglock must be held by the caller.
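The per-CPU queue drained above is a singly linked list threaded through ->posix_timer_list and terminated by an entry that points to itself, so a NULL pointer can keep meaning "not queued" in run_posix_cpu_timers(). A minimal user-space sketch of the same push/drain protocol (plain C with illustrative names; no locking, not the kernel API):

/* Simplified analogue of the self-terminated list used by the RT
 * posix-cpu-timers offload above; "node" stands in for task_struct and
 * only the list handling is modelled. */
#include <stddef.h>
#include <stdio.h>

struct node {
        int id;
        struct node *posix_timer_list;          /* NULL means "not queued" */
};

static struct node *tasklist;                   /* per-CPU list head in the patch */

static void enqueue(struct node *n)             /* cf. run_posix_cpu_timers() */
{
        if (n->posix_timer_list)
                return;                         /* already queued */
        n->posix_timer_list = tasklist ? tasklist : n;  /* self-pointer terminates */
        tasklist = n;
}

static void drain(void)                         /* cf. posix_cpu_timers_thread() */
{
        struct node *n = tasklist, *next;

        tasklist = NULL;
        while (n) {
                next = n->posix_timer_list;     /* save next */
                n->posix_timer_list = NULL;     /* clear and "run its timers" */
                printf("run timers for node %d\n", n->id);
                if (next == n)                  /* self-pointer: last entry */
                        break;
                n = next;
        }
}

int main(void)
{
        struct node a = { .id = 1 }, b = { .id = 2 };

        enqueue(&a);
        enqueue(&b);
        drain();
        return 0;
}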
19819 diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
19820 index f2826c35e918..464a98155a0e 100644
19821 --- a/kernel/time/posix-timers.c
19822 +++ b/kernel/time/posix-timers.c
19823 @@ -506,6 +506,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
19824  static struct pid *good_sigevent(sigevent_t * event)
19825  {
19826         struct task_struct *rtn = current->group_leader;
19827 +       int sig = event->sigev_signo;
19828  
19829         if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
19830                 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
19831 @@ -514,7 +515,8 @@ static struct pid *good_sigevent(sigevent_t * event)
19832                 return NULL;
19833  
19834         if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
19835 -           ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
19836 +           (sig <= 0 || sig > SIGRTMAX || sig_kernel_only(sig) ||
19837 +            sig_kernel_coredump(sig)))
19838                 return NULL;
19839  
19840         return task_pid(rtn);
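One user-visible effect of the stricter good_sigevent() check above: a timer whose sigevent asks for a signal the kernel reserves for itself (e.g. SIGKILL, a sig_kernel_only() signal) is now refused at creation time. A small test program under that assumption (the exact errno is not spelled out in the hunk):

/* Expect timer_create() to fail on a kernel carrying the hunk above when
 * the requested notification signal is SIGKILL; on older kernels the call
 * succeeds and the timer is simply deleted again.  Link with -lrt if your
 * libc needs it. */
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct sigevent sev = {
                .sigev_notify = SIGEV_SIGNAL,
                .sigev_signo  = SIGKILL,        /* rejected by sig_kernel_only() */
        };
        timer_t tid;

        if (timer_create(CLOCK_MONOTONIC, &sev, &tid) == -1)
                perror("timer_create");         /* expected with the patch applied */
        else
                timer_delete(tid);
        return 0;
}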
19841 @@ -826,6 +828,20 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
19842         return overrun;
19843  }
19844  
19845 +/*
19846 + * Protected by RCU!
19847 + */
19848 +static void timer_wait_for_callback(struct k_clock *kc, struct k_itimer *timr)
19849 +{
19850 +#ifdef CONFIG_PREEMPT_RT_FULL
19851 +       if (kc->timer_set == common_timer_set)
19852 +               hrtimer_wait_for_timer(&timr->it.real.timer);
19853 +       else
19854 +               /* FIXME: Whacky hack for posix-cpu-timers */
19855 +               schedule_timeout(1);
19856 +#endif
19857 +}
19858 +
19859  /* Set a POSIX.1b interval timer. */
19860  /* timr->it_lock is taken. */
19861  static int
19862 @@ -903,6 +919,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
19863         if (!timr)
19864                 return -EINVAL;
19865  
19866 +       rcu_read_lock();
19867         kc = clockid_to_kclock(timr->it_clock);
19868         if (WARN_ON_ONCE(!kc || !kc->timer_set))
19869                 error = -EINVAL;
19870 @@ -911,9 +928,12 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
19871  
19872         unlock_timer(timr, flag);
19873         if (error == TIMER_RETRY) {
19874 +               timer_wait_for_callback(kc, timr);
19875                 rtn = NULL;     // We already got the old time...
19876 +               rcu_read_unlock();
19877                 goto retry;
19878         }
19879 +       rcu_read_unlock();
19880  
19881         if (old_setting && !error &&
19882             copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
19883 @@ -951,10 +971,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
19884         if (!timer)
19885                 return -EINVAL;
19886  
19887 +       rcu_read_lock();
19888         if (timer_delete_hook(timer) == TIMER_RETRY) {
19889                 unlock_timer(timer, flags);
19890 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
19891 +                                       timer);
19892 +               rcu_read_unlock();
19893                 goto retry_delete;
19894         }
19895 +       rcu_read_unlock();
19896  
19897         spin_lock(&current->sighand->siglock);
19898         list_del(&timer->list);
19899 @@ -980,8 +1005,18 @@ static void itimer_delete(struct k_itimer *timer)
19900  retry_delete:
19901         spin_lock_irqsave(&timer->it_lock, flags);
19902  
19903 -       if (timer_delete_hook(timer) == TIMER_RETRY) {
19904 +       /* On RT we can race with a deletion */
19905 +       if (!timer->it_signal) {
19906                 unlock_timer(timer, flags);
19907 +               return;
19908 +       }
19909 +
19910 +       if (timer_delete_hook(timer) == TIMER_RETRY) {
19911 +               rcu_read_lock();
19912 +               unlock_timer(timer, flags);
19913 +               timer_wait_for_callback(clockid_to_kclock(timer->it_clock),
19914 +                                       timer);
19915 +               rcu_read_unlock();
19916                 goto retry_delete;
19917         }
19918         list_del(&timer->list);
19919 diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
19920 index 690b797f522e..fe8ba1619879 100644
19921 --- a/kernel/time/tick-broadcast-hrtimer.c
19922 +++ b/kernel/time/tick-broadcast-hrtimer.c
19923 @@ -107,5 +107,6 @@ void tick_setup_hrtimer_broadcast(void)
19924  {
19925         hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
19926         bctimer.function = bc_handler;
19927 +       bctimer.irqsafe = true;
19928         clockevents_register_device(&ce_broadcast_hrtimer);
19929  }
19930 diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
19931 index 4fcd99e12aa0..5a47f2e98faf 100644
19932 --- a/kernel/time/tick-common.c
19933 +++ b/kernel/time/tick-common.c
19934 @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void)
19935  static void tick_periodic(int cpu)
19936  {
19937         if (tick_do_timer_cpu == cpu) {
19938 -               write_seqlock(&jiffies_lock);
19939 +               raw_spin_lock(&jiffies_lock);
19940 +               write_seqcount_begin(&jiffies_seq);
19941  
19942                 /* Keep track of the next tick event */
19943                 tick_next_period = ktime_add(tick_next_period, tick_period);
19944  
19945                 do_timer(1);
19946 -               write_sequnlock(&jiffies_lock);
19947 +               write_seqcount_end(&jiffies_seq);
19948 +               raw_spin_unlock(&jiffies_lock);
19949                 update_wall_time();
19950         }
19951  
19952 @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
19953                 ktime_t next;
19954  
19955                 do {
19956 -                       seq = read_seqbegin(&jiffies_lock);
19957 +                       seq = read_seqcount_begin(&jiffies_seq);
19958                         next = tick_next_period;
19959 -               } while (read_seqretry(&jiffies_lock, seq));
19960 +               } while (read_seqcount_retry(&jiffies_seq, seq));
19961  
19962                 clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
19963  
19964 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
19965 index 2ec7c00228f3..c1b30b8c671a 100644
19966 --- a/kernel/time/tick-sched.c
19967 +++ b/kernel/time/tick-sched.c
19968 @@ -62,7 +62,8 @@ static void tick_do_update_jiffies64(ktime_t now)
19969                 return;
19970  
19971         /* Reevaluate with jiffies_lock held */
19972 -       write_seqlock(&jiffies_lock);
19973 +       raw_spin_lock(&jiffies_lock);
19974 +       write_seqcount_begin(&jiffies_seq);
19975  
19976         delta = ktime_sub(now, last_jiffies_update);
19977         if (delta.tv64 >= tick_period.tv64) {
19978 @@ -85,10 +86,12 @@ static void tick_do_update_jiffies64(ktime_t now)
19979                 /* Keep the tick_next_period variable up to date */
19980                 tick_next_period = ktime_add(last_jiffies_update, tick_period);
19981         } else {
19982 -               write_sequnlock(&jiffies_lock);
19983 +               write_seqcount_end(&jiffies_seq);
19984 +               raw_spin_unlock(&jiffies_lock);
19985                 return;
19986         }
19987 -       write_sequnlock(&jiffies_lock);
19988 +       write_seqcount_end(&jiffies_seq);
19989 +       raw_spin_unlock(&jiffies_lock);
19990         update_wall_time();
19991  }
19992  
19993 @@ -99,12 +102,14 @@ static ktime_t tick_init_jiffy_update(void)
19994  {
19995         ktime_t period;
19996  
19997 -       write_seqlock(&jiffies_lock);
19998 +       raw_spin_lock(&jiffies_lock);
19999 +       write_seqcount_begin(&jiffies_seq);
20000         /* Did we start the jiffies update yet ? */
20001         if (last_jiffies_update.tv64 == 0)
20002                 last_jiffies_update = tick_next_period;
20003         period = last_jiffies_update;
20004 -       write_sequnlock(&jiffies_lock);
20005 +       write_seqcount_end(&jiffies_seq);
20006 +       raw_spin_unlock(&jiffies_lock);
20007         return period;
20008  }
20009  
20010 @@ -212,6 +217,7 @@ static void nohz_full_kick_func(struct irq_work *work)
20011  
20012  static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
20013         .func = nohz_full_kick_func,
20014 +       .flags = IRQ_WORK_HARD_IRQ,
20015  };
20016  
20017  /*
20018 @@ -670,10 +676,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
20019  
20020         /* Read jiffies and the time when jiffies were updated last */
20021         do {
20022 -               seq = read_seqbegin(&jiffies_lock);
20023 +               seq = read_seqcount_begin(&jiffies_seq);
20024                 basemono = last_jiffies_update.tv64;
20025                 basejiff = jiffies;
20026 -       } while (read_seqretry(&jiffies_lock, seq));
20027 +       } while (read_seqcount_retry(&jiffies_seq, seq));
20028         ts->last_jiffies = basejiff;
20029  
20030         if (rcu_needs_cpu(basemono, &next_rcu) ||
20031 @@ -874,14 +880,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
20032                 return false;
20033  
20034         if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
20035 -               static int ratelimit;
20036 -
20037 -               if (ratelimit < 10 &&
20038 -                   (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
20039 -                       pr_warn("NOHZ: local_softirq_pending %02x\n",
20040 -                               (unsigned int) local_softirq_pending());
20041 -                       ratelimit++;
20042 -               }
20043 +               softirq_check_pending_idle();
20044                 return false;
20045         }
20046  
20047 @@ -1190,6 +1189,7 @@ void tick_setup_sched_timer(void)
20048          * Emulate tick processing via per-CPU hrtimers:
20049          */
20050         hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
20051 +       ts->sched_timer.irqsafe = 1;
20052         ts->sched_timer.function = tick_sched_timer;
20053  
20054         /* Get the next period (per-CPU) */
20055 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
20056 index 37dec7e3db43..fa8d90d2acc3 100644
20057 --- a/kernel/time/timekeeping.c
20058 +++ b/kernel/time/timekeeping.c
20059 @@ -2328,8 +2328,10 @@ EXPORT_SYMBOL(hardpps);
20060   */
20061  void xtime_update(unsigned long ticks)
20062  {
20063 -       write_seqlock(&jiffies_lock);
20064 +       raw_spin_lock(&jiffies_lock);
20065 +       write_seqcount_begin(&jiffies_seq);
20066         do_timer(ticks);
20067 -       write_sequnlock(&jiffies_lock);
20068 +       write_seqcount_end(&jiffies_seq);
20069 +       raw_spin_unlock(&jiffies_lock);
20070         update_wall_time();
20071  }
20072 diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
20073 index 704f595ce83f..763a3e5121ff 100644
20074 --- a/kernel/time/timekeeping.h
20075 +++ b/kernel/time/timekeeping.h
20076 @@ -19,7 +19,8 @@ extern void timekeeping_resume(void);
20077  extern void do_timer(unsigned long ticks);
20078  extern void update_wall_time(void);
20079  
20080 -extern seqlock_t jiffies_lock;
20081 +extern raw_spinlock_t jiffies_lock;
20082 +extern seqcount_t jiffies_seq;
20083  
20084  #define CS_NAME_LEN    32
20085  
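The jiffies_lock changes above follow one pattern throughout: the old seqlock is split into a raw spinlock that serializes writers (and stays a busy-waiting lock on RT) plus a bare sequence count that lockless readers retry against. A user-space analogue of that reader/writer protocol, with a pthread mutex standing in for the raw spinlock and a C11 atomic for the seqcount (a sketch, not the kernel seqcount API):

/* Writer side mirrors tick_periodic()/xtime_update() above: take the lock,
 * bump the sequence (odd = update in progress), update, bump again (even).
 * Readers mirror the tick_nohz_* hunks: never take the lock, just retry if
 * the sequence was odd or changed.  Single-threaded demo, simplified
 * memory ordering. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t jiffies_lock = PTHREAD_MUTEX_INITIALIZER; /* "raw spinlock" */
static atomic_uint jiffies_seq;                                  /* "seqcount" */
static unsigned long long jiffies64;

static void do_tick(void)
{
        pthread_mutex_lock(&jiffies_lock);
        atomic_fetch_add_explicit(&jiffies_seq, 1, memory_order_release);
        jiffies64++;
        atomic_fetch_add_explicit(&jiffies_seq, 1, memory_order_release);
        pthread_mutex_unlock(&jiffies_lock);
}

static unsigned long long read_jiffies(void)
{
        unsigned int seq;
        unsigned long long val;

        do {
                seq = atomic_load_explicit(&jiffies_seq, memory_order_acquire);
                val = jiffies64;
        } while ((seq & 1) ||
                 seq != atomic_load_explicit(&jiffies_seq, memory_order_acquire));
        return val;
}

int main(void)
{
        do_tick();
        printf("jiffies64 = %llu\n", read_jiffies());
        return 0;
}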
20086 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
20087 index 32bf6f75a8fe..ba53447a03f5 100644
20088 --- a/kernel/time/timer.c
20089 +++ b/kernel/time/timer.c
20090 @@ -193,8 +193,11 @@ EXPORT_SYMBOL(jiffies_64);
20091  #endif
20092  
20093  struct timer_base {
20094 -       spinlock_t              lock;
20095 +       raw_spinlock_t          lock;
20096         struct timer_list       *running_timer;
20097 +#ifdef CONFIG_PREEMPT_RT_FULL
20098 +       struct swait_queue_head wait_for_running_timer;
20099 +#endif
20100         unsigned long           clk;
20101         unsigned long           next_expiry;
20102         unsigned int            cpu;
20103 @@ -947,10 +950,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
20104  
20105                 if (!(tf & TIMER_MIGRATING)) {
20106                         base = get_timer_base(tf);
20107 -                       spin_lock_irqsave(&base->lock, *flags);
20108 +                       raw_spin_lock_irqsave(&base->lock, *flags);
20109                         if (timer->flags == tf)
20110                                 return base;
20111 -                       spin_unlock_irqrestore(&base->lock, *flags);
20112 +                       raw_spin_unlock_irqrestore(&base->lock, *flags);
20113                 }
20114                 cpu_relax();
20115         }
20116 @@ -1017,9 +1020,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
20117                         /* See the comment in lock_timer_base() */
20118                         timer->flags |= TIMER_MIGRATING;
20119  
20120 -                       spin_unlock(&base->lock);
20121 +                       raw_spin_unlock(&base->lock);
20122                         base = new_base;
20123 -                       spin_lock(&base->lock);
20124 +                       raw_spin_lock(&base->lock);
20125                         WRITE_ONCE(timer->flags,
20126                                    (timer->flags & ~TIMER_BASEMASK) | base->cpu);
20127                 }
20128 @@ -1040,7 +1043,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
20129         }
20130  
20131  out_unlock:
20132 -       spin_unlock_irqrestore(&base->lock, flags);
20133 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20134  
20135         return ret;
20136  }
20137 @@ -1134,19 +1137,46 @@ void add_timer_on(struct timer_list *timer, int cpu)
20138         if (base != new_base) {
20139                 timer->flags |= TIMER_MIGRATING;
20140  
20141 -               spin_unlock(&base->lock);
20142 +               raw_spin_unlock(&base->lock);
20143                 base = new_base;
20144 -               spin_lock(&base->lock);
20145 +               raw_spin_lock(&base->lock);
20146                 WRITE_ONCE(timer->flags,
20147                            (timer->flags & ~TIMER_BASEMASK) | cpu);
20148         }
20149  
20150         debug_activate(timer, timer->expires);
20151         internal_add_timer(base, timer);
20152 -       spin_unlock_irqrestore(&base->lock, flags);
20153 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20154  }
20155  EXPORT_SYMBOL_GPL(add_timer_on);
20156  
20157 +#ifdef CONFIG_PREEMPT_RT_FULL
20158 +/*
20159 + * Wait for a running timer
20160 + */
20161 +static void wait_for_running_timer(struct timer_list *timer)
20162 +{
20163 +       struct timer_base *base;
20164 +       u32 tf = timer->flags;
20165 +
20166 +       if (tf & TIMER_MIGRATING)
20167 +               return;
20168 +
20169 +       base = get_timer_base(tf);
20170 +       swait_event(base->wait_for_running_timer,
20171 +                  base->running_timer != timer);
20172 +}
20173 +
20174 +# define wakeup_timer_waiters(b)       swake_up_all(&(b)->wait_for_running_timer)
20175 +#else
20176 +static inline void wait_for_running_timer(struct timer_list *timer)
20177 +{
20178 +       cpu_relax();
20179 +}
20180 +
20181 +# define wakeup_timer_waiters(b)       do { } while (0)
20182 +#endif
20183 +
20184  /**
20185   * del_timer - deactive a timer.
20186   * @timer: the timer to be deactivated
20187 @@ -1170,7 +1200,7 @@ int del_timer(struct timer_list *timer)
20188         if (timer_pending(timer)) {
20189                 base = lock_timer_base(timer, &flags);
20190                 ret = detach_if_pending(timer, base, true);
20191 -               spin_unlock_irqrestore(&base->lock, flags);
20192 +               raw_spin_unlock_irqrestore(&base->lock, flags);
20193         }
20194  
20195         return ret;
20196 @@ -1198,13 +1228,13 @@ int try_to_del_timer_sync(struct timer_list *timer)
20197                 timer_stats_timer_clear_start_info(timer);
20198                 ret = detach_if_pending(timer, base, true);
20199         }
20200 -       spin_unlock_irqrestore(&base->lock, flags);
20201 +       raw_spin_unlock_irqrestore(&base->lock, flags);
20202  
20203         return ret;
20204  }
20205  EXPORT_SYMBOL(try_to_del_timer_sync);
20206  
20207 -#ifdef CONFIG_SMP
20208 +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
20209  /**
20210   * del_timer_sync - deactivate a timer and wait for the handler to finish.
20211   * @timer: the timer to be deactivated
20212 @@ -1264,7 +1294,7 @@ int del_timer_sync(struct timer_list *timer)
20213                 int ret = try_to_del_timer_sync(timer);
20214                 if (ret >= 0)
20215                         return ret;
20216 -               cpu_relax();
20217 +               wait_for_running_timer(timer);
20218         }
20219  }
20220  EXPORT_SYMBOL(del_timer_sync);
20221 @@ -1329,14 +1359,17 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
20222                 fn = timer->function;
20223                 data = timer->data;
20224  
20225 -               if (timer->flags & TIMER_IRQSAFE) {
20226 -                       spin_unlock(&base->lock);
20227 +               if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
20228 +                   timer->flags & TIMER_IRQSAFE) {
20229 +                       raw_spin_unlock(&base->lock);
20230                         call_timer_fn(timer, fn, data);
20231 -                       spin_lock(&base->lock);
20232 +                       base->running_timer = NULL;
20233 +                       raw_spin_lock(&base->lock);
20234                 } else {
20235 -                       spin_unlock_irq(&base->lock);
20236 +                       raw_spin_unlock_irq(&base->lock);
20237                         call_timer_fn(timer, fn, data);
20238 -                       spin_lock_irq(&base->lock);
20239 +                       base->running_timer = NULL;
20240 +                       raw_spin_lock_irq(&base->lock);
20241                 }
20242         }
20243  }
20244 @@ -1505,7 +1538,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
20245         if (cpu_is_offline(smp_processor_id()))
20246                 return expires;
20247  
20248 -       spin_lock(&base->lock);
20249 +       raw_spin_lock(&base->lock);
20250         nextevt = __next_timer_interrupt(base);
20251         is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
20252         base->next_expiry = nextevt;
20253 @@ -1529,7 +1562,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
20254                 if ((expires - basem) > TICK_NSEC)
20255                         base->is_idle = true;
20256         }
20257 -       spin_unlock(&base->lock);
20258 +       raw_spin_unlock(&base->lock);
20259  
20260         return cmp_next_hrtimer_event(basem, expires);
20261  }
20262 @@ -1594,13 +1627,13 @@ void update_process_times(int user_tick)
20263  
20264         /* Note: this timer irq context must be accounted for as well. */
20265         account_process_tick(p, user_tick);
20266 +       scheduler_tick();
20267         run_local_timers();
20268         rcu_check_callbacks(user_tick);
20269 -#ifdef CONFIG_IRQ_WORK
20270 +#if defined(CONFIG_IRQ_WORK)
20271         if (in_irq())
20272                 irq_work_tick();
20273  #endif
20274 -       scheduler_tick();
20275         run_posix_cpu_timers(p);
20276  }
20277  
20278 @@ -1616,7 +1649,7 @@ static inline void __run_timers(struct timer_base *base)
20279         if (!time_after_eq(jiffies, base->clk))
20280                 return;
20281  
20282 -       spin_lock_irq(&base->lock);
20283 +       raw_spin_lock_irq(&base->lock);
20284  
20285         while (time_after_eq(jiffies, base->clk)) {
20286  
20287 @@ -1626,8 +1659,8 @@ static inline void __run_timers(struct timer_base *base)
20288                 while (levels--)
20289                         expire_timers(base, heads + levels);
20290         }
20291 -       base->running_timer = NULL;
20292 -       spin_unlock_irq(&base->lock);
20293 +       raw_spin_unlock_irq(&base->lock);
20294 +       wakeup_timer_waiters(base);
20295  }
20296  
20297  /*
20298 @@ -1637,6 +1670,8 @@ static void run_timer_softirq(struct softirq_action *h)
20299  {
20300         struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
20301  
20302 +       irq_work_tick_soft();
20303 +
20304         __run_timers(base);
20305         if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
20306                 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
20307 @@ -1822,16 +1857,16 @@ int timers_dead_cpu(unsigned int cpu)
20308                  * The caller is globally serialized and nobody else
20309                  * takes two locks at once, deadlock is not possible.
20310                  */
20311 -               spin_lock_irq(&new_base->lock);
20312 -               spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
20313 +               raw_spin_lock_irq(&new_base->lock);
20314 +               raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
20315  
20316                 BUG_ON(old_base->running_timer);
20317  
20318                 for (i = 0; i < WHEEL_SIZE; i++)
20319                         migrate_timer_list(new_base, old_base->vectors + i);
20320  
20321 -               spin_unlock(&old_base->lock);
20322 -               spin_unlock_irq(&new_base->lock);
20323 +               raw_spin_unlock(&old_base->lock);
20324 +               raw_spin_unlock_irq(&new_base->lock);
20325                 put_cpu_ptr(&timer_bases);
20326         }
20327         return 0;
20328 @@ -1847,8 +1882,11 @@ static void __init init_timer_cpu(int cpu)
20329         for (i = 0; i < NR_BASES; i++) {
20330                 base = per_cpu_ptr(&timer_bases[i], cpu);
20331                 base->cpu = cpu;
20332 -               spin_lock_init(&base->lock);
20333 +               raw_spin_lock_init(&base->lock);
20334                 base->clk = jiffies;
20335 +#ifdef CONFIG_PREEMPT_RT_FULL
20336 +               init_swait_queue_head(&base->wait_for_running_timer);
20337 +#endif
20338         }
20339  }
20340  
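With the timer-base lock turned into a raw spinlock and the TIMER_IRQSAFE shortcut disabled on PREEMPT_RT_FULL, del_timer_sync() above no longer spins with cpu_relax(): it sleeps on the per-base swait queue until base->running_timer stops pointing at the timer being cancelled, and __run_timers() wakes the waiters after clearing it. A condition-variable sketch of that wait/wake handshake (user-space pthread names as an assumed analogue of the kernel swait API):

/* wait_for_running_timer()/wakeup_timer_waiters() analogue: the canceller
 * sleeps until the "running timer" marker no longer refers to its timer,
 * and the expiry path clears the marker and broadcasts.  Illustrative only. */
#include <pthread.h>
#include <stddef.h>

struct timer_base {
        pthread_mutex_t lock;                   /* stands in for base->lock */
        pthread_cond_t  wait_for_running_timer; /* stands in for the swait queue */
        void            *running_timer;
};

static void wait_for_running_timer(struct timer_base *base, void *timer)
{
        pthread_mutex_lock(&base->lock);
        while (base->running_timer == timer)    /* cf. the swait_event() condition */
                pthread_cond_wait(&base->wait_for_running_timer, &base->lock);
        pthread_mutex_unlock(&base->lock);
}

static void expire_one_timer(struct timer_base *base, void *timer,
                             void (*fn)(void *), void *data)
{
        pthread_mutex_lock(&base->lock);
        base->running_timer = timer;
        pthread_mutex_unlock(&base->lock);

        fn(data);                               /* callback runs without the lock */

        pthread_mutex_lock(&base->lock);
        base->running_timer = NULL;
        pthread_cond_broadcast(&base->wait_for_running_timer); /* cf. swake_up_all() */
        pthread_mutex_unlock(&base->lock);
}

static void dummy_fn(void *data) { (void)data; }

int main(void)
{
        struct timer_base base;
        int timer;

        pthread_mutex_init(&base.lock, NULL);
        pthread_cond_init(&base.wait_for_running_timer, NULL);
        base.running_timer = NULL;

        expire_one_timer(&base, &timer, dummy_fn, NULL);
        wait_for_running_timer(&base, &timer);  /* returns at once: nothing running */
        return 0;
}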
20341 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
20342 index f4b86e8ca1e7..340f14eef24a 100644
20343 --- a/kernel/trace/Kconfig
20344 +++ b/kernel/trace/Kconfig
20345 @@ -187,6 +187,24 @@ config IRQSOFF_TRACER
20346           enabled. This option and the preempt-off timing option can be
20347           used together or separately.)
20348  
20349 +config INTERRUPT_OFF_HIST
20350 +       bool "Interrupts-off Latency Histogram"
20351 +       depends on IRQSOFF_TRACER
20352 +       help
20353 +         This option generates continuously updated histograms (one per cpu)
20354 +         of the duration of time periods with interrupts disabled. The
20355 +         histograms are disabled by default. To enable them, write a non-zero
20356 +         number to
20357 +
20358 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
20359 +
20360 +         If PREEMPT_OFF_HIST is also selected, additional histograms (one
20361 +         per cpu) are generated that accumulate the duration of time periods
20362 +         when both interrupts and preemption are disabled. The histogram data
20363 +         will be located in the debug file system at
20364 +
20365 +             /sys/kernel/debug/tracing/latency_hist/irqsoff
20366 +
20367  config PREEMPT_TRACER
20368         bool "Preemption-off Latency Tracer"
20369         default n
20370 @@ -197,6 +215,7 @@ config PREEMPT_TRACER
20371         select RING_BUFFER_ALLOW_SWAP
20372         select TRACER_SNAPSHOT
20373         select TRACER_SNAPSHOT_PER_CPU_SWAP
20374 +       select USING_GET_LOCK_PARENT_IP
20375         help
20376           This option measures the time spent in preemption-off critical
20377           sections, with microsecond accuracy.
20378 @@ -211,6 +230,24 @@ config PREEMPT_TRACER
20379           enabled. This option and the irqs-off timing option can be
20380           used together or separately.)
20381  
20382 +config PREEMPT_OFF_HIST
20383 +       bool "Preemption-off Latency Histogram"
20384 +       depends on PREEMPT_TRACER
20385 +       help
20386 +         This option generates continuously updated histograms (one per cpu)
20387 +         of the duration of time periods with preemption disabled. The
20388 +         histograms are disabled by default. To enable them, write a non-zero
20389 +         number to
20390 +
20391 +             /sys/kernel/debug/tracing/latency_hist/enable/preemptirqsoff
20392 +
20393 +         If INTERRUPT_OFF_HIST is also selected, additional histograms (one
20394 +         per cpu) are generated that accumulate the duration of time periods
20395 +         when both interrupts and preemption are disabled. The histogram data
20396 +         will be located in the debug file system at
20397 +
20398 +             /sys/kernel/debug/tracing/latency_hist/preemptoff
20399 +
20400  config SCHED_TRACER
20401         bool "Scheduling Latency Tracer"
20402         select GENERIC_TRACER
20403 @@ -221,6 +258,74 @@ config SCHED_TRACER
20404           This tracer tracks the latency of the highest priority task
20405           to be scheduled in, starting from the point it has woken up.
20406  
20407 +config WAKEUP_LATENCY_HIST
20408 +       bool "Scheduling Latency Histogram"
20409 +       depends on SCHED_TRACER
20410 +       help
20411 +         This option generates continuously updated histograms (one per cpu)
20412 +         of the scheduling latency of the highest priority task.
20413 +         The histograms are disabled by default. To enable them, write a
20414 +         non-zero number to
20415 +
20416 +             /sys/kernel/debug/tracing/latency_hist/enable/wakeup
20417 +
20418 +         Two different algorithms are used, one to determine the latency of
20419 +         processes that exclusively use the highest priority of the system and
20420 +         another one to determine the latency of processes that share the
20421 +         highest system priority with other processes. The former is used to
20422 +         improve hardware and system software, the latter to optimize the
20423 +         priority design of a given system. The histogram data will be
20424 +         located in the debug file system at
20425 +
20426 +             /sys/kernel/debug/tracing/latency_hist/wakeup
20427 +
20428 +         and
20429 +
20430 +             /sys/kernel/debug/tracing/latency_hist/wakeup/sharedprio
20431 +
20432 +         If both Scheduling Latency Histogram and Missed Timer Offsets
20433 +         Histogram are selected, additional histogram data will be collected
20434 +         that contain, in addition to the wakeup latency, the timer latency, in
20435 +         cases where the wakeup was triggered by an expired timer. These histograms
20436 +         are available in the
20437 +
20438 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
20439 +
20440 +         directory. They reflect the apparent interrupt and scheduling latency
20441 +         and are best suited to determining the worst-case latency of a given
20442 +         system. To enable these histograms, write a non-zero number to
20443 +
20444 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
20445 +
20446 +config MISSED_TIMER_OFFSETS_HIST
20447 +       depends on HIGH_RES_TIMERS
20448 +       select GENERIC_TRACER
20449 +       bool "Missed Timer Offsets Histogram"
20450 +       help
20451 +         Generate a histogram of missed timer offsets in microseconds. The
20452 +         histograms are disabled by default. To enable them, write a non-zero
20453 +         number to
20454 +
20455 +             /sys/kernel/debug/tracing/latency_hist/enable/missed_timer_offsets
20456 +
20457 +         The histogram data will be located in the debug file system at
20458 +
20459 +             /sys/kernel/debug/tracing/latency_hist/missed_timer_offsets
20460 +
20461 +         If both Scheduling Latency Histogram and Missed Timer Offsets
20462 +         Histogram are selected, additional histogram data will be collected
20463 +         that contain, in addition to the wakeup latency, the timer latency, in
20464 +         cases where the wakeup was triggered by an expired timer. These histograms
20465 +         are available in the
20466 +
20467 +             /sys/kernel/debug/tracing/latency_hist/timerandwakeup
20468 +
20469 +         directory. They reflect the apparent interrupt and scheduling latency
20470 +         and are best suited to determining the worst-case latency of a given
20471 +         system. To enable these histograms, write a non-zero number to
20472 +
20473 +             /sys/kernel/debug/tracing/latency_hist/enable/timerandwakeup
20474 +
20475  config ENABLE_DEFAULT_TRACERS
20476         bool "Trace process context switches and events"
20477         depends on !GENERIC_TRACER
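All four histogram options above are driven the same way at run time: write a non-zero value to the matching file under latency_hist/enable/ and read the per-CPU histogram files back. A small C sketch of that round trip for the wakeup histogram, assuming debugfs is mounted at /sys/kernel/debug; only the directory and enable-file names come from the help texts, and the per-CPU file name "CPU0" is a guess:

/* Enable the wakeup latency histogram and dump one CPU's data.  The enable
 * path is taken from the Kconfig help text above; the "CPU0" file name is
 * an assumption, and error handling is minimal. */
#include <stdio.h>

int main(void)
{
        const char *enable =
                "/sys/kernel/debug/tracing/latency_hist/enable/wakeup";
        const char *hist =
                "/sys/kernel/debug/tracing/latency_hist/wakeup/CPU0";
        char line[256];
        FILE *f;

        f = fopen(enable, "w");
        if (!f) {
                perror(enable);
                return 1;
        }
        fputs("1\n", f);                        /* any non-zero value enables */
        fclose(f);

        f = fopen(hist, "r");
        if (!f) {
                perror(hist);
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);            /* per-bucket "usecs samples" rows */
        fclose(f);
        return 0;
}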
20478 diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
20479 index d0a1617b52b4..6bf9e9ff1fa5 100644
20480 --- a/kernel/trace/Makefile
20481 +++ b/kernel/trace/Makefile
20482 @@ -41,6 +41,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
20483  obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20484  obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
20485  obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
20486 +obj-$(CONFIG_INTERRUPT_OFF_HIST) += latency_hist.o
20487 +obj-$(CONFIG_PREEMPT_OFF_HIST) += latency_hist.o
20488 +obj-$(CONFIG_WAKEUP_LATENCY_HIST) += latency_hist.o
20489 +obj-$(CONFIG_MISSED_TIMER_OFFSETS_HIST) += latency_hist.o
20490  obj-$(CONFIG_NOP_TRACER) += trace_nop.o
20491  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
20492  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
20493 diff --git a/kernel/trace/latency_hist.c b/kernel/trace/latency_hist.c
20494 new file mode 100644
20495 index 000000000000..7f6ee70dea41
20496 --- /dev/null
20497 +++ b/kernel/trace/latency_hist.c
20498 @@ -0,0 +1,1178 @@
20499 +/*
20500 + * kernel/trace/latency_hist.c
20501 + *
20502 + * Add support for histograms of preemption-off latency,
20503 + * interrupt-off latency and wakeup latency; it depends on
20504 + * Real-Time Preemption support.
20505 + *
20506 + *  Copyright (C) 2005 MontaVista Software, Inc.
20507 + *  Yi Yang <yyang@ch.mvista.com>
20508 + *
20509 + *  Converted to work with the new latency tracer.
20510 + *  Copyright (C) 2008 Red Hat, Inc.
20511 + *    Steven Rostedt <srostedt@redhat.com>
20512 + *
20513 + */
20514 +#include <linux/module.h>
20515 +#include <linux/debugfs.h>
20516 +#include <linux/seq_file.h>
20517 +#include <linux/percpu.h>
20518 +#include <linux/kallsyms.h>
20519 +#include <linux/uaccess.h>
20520 +#include <linux/sched.h>
20521 +#include <linux/sched/rt.h>
20522 +#include <linux/slab.h>
20523 +#include <linux/atomic.h>
20524 +#include <asm/div64.h>
20525 +
20526 +#include "trace.h"
20527 +#include <trace/events/sched.h>
20528 +
20529 +#define NSECS_PER_USECS 1000L
20530 +
20531 +#define CREATE_TRACE_POINTS
20532 +#include <trace/events/hist.h>
20533 +
20534 +enum {
20535 +       IRQSOFF_LATENCY = 0,
20536 +       PREEMPTOFF_LATENCY,
20537 +       PREEMPTIRQSOFF_LATENCY,
20538 +       WAKEUP_LATENCY,
20539 +       WAKEUP_LATENCY_SHAREDPRIO,
20540 +       MISSED_TIMER_OFFSETS,
20541 +       TIMERANDWAKEUP_LATENCY,
20542 +       MAX_LATENCY_TYPE,
20543 +};
20544 +
20545 +#define MAX_ENTRY_NUM 10240
20546 +
20547 +struct hist_data {
20548 +       atomic_t hist_mode; /* 0 log, 1 don't log */
20549 +       long offset; /* set it to MAX_ENTRY_NUM/2 for a bipolar scale */
20550 +       long min_lat;
20551 +       long max_lat;
20552 +       unsigned long long below_hist_bound_samples;
20553 +       unsigned long long above_hist_bound_samples;
20554 +       long long accumulate_lat;
20555 +       unsigned long long total_samples;
20556 +       unsigned long long hist_array[MAX_ENTRY_NUM];
20557 +};
20558 +
20559 +struct enable_data {
20560 +       int latency_type;
20561 +       int enabled;
20562 +};
20563 +
20564 +static char *latency_hist_dir_root = "latency_hist";
20565 +
20566 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20567 +static DEFINE_PER_CPU(struct hist_data, irqsoff_hist);
20568 +static char *irqsoff_hist_dir = "irqsoff";
20569 +static DEFINE_PER_CPU(cycles_t, hist_irqsoff_start);
20570 +static DEFINE_PER_CPU(int, hist_irqsoff_counting);
20571 +#endif
20572 +
20573 +#ifdef CONFIG_PREEMPT_OFF_HIST
20574 +static DEFINE_PER_CPU(struct hist_data, preemptoff_hist);
20575 +static char *preemptoff_hist_dir = "preemptoff";
20576 +static DEFINE_PER_CPU(cycles_t, hist_preemptoff_start);
20577 +static DEFINE_PER_CPU(int, hist_preemptoff_counting);
20578 +#endif
20579 +
20580 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
20581 +static DEFINE_PER_CPU(struct hist_data, preemptirqsoff_hist);
20582 +static char *preemptirqsoff_hist_dir = "preemptirqsoff";
20583 +static DEFINE_PER_CPU(cycles_t, hist_preemptirqsoff_start);
20584 +static DEFINE_PER_CPU(int, hist_preemptirqsoff_counting);
20585 +#endif
20586 +
20587 +#if defined(CONFIG_PREEMPT_OFF_HIST) || defined(CONFIG_INTERRUPT_OFF_HIST)
20588 +static notrace void probe_preemptirqsoff_hist(void *v, int reason, int start);
20589 +static struct enable_data preemptirqsoff_enabled_data = {
20590 +       .latency_type = PREEMPTIRQSOFF_LATENCY,
20591 +       .enabled = 0,
20592 +};
20593 +#endif
20594 +
20595 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20596 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20597 +struct maxlatproc_data {
20598 +       char comm[FIELD_SIZEOF(struct task_struct, comm)];
20599 +       char current_comm[FIELD_SIZEOF(struct task_struct, comm)];
20600 +       int pid;
20601 +       int current_pid;
20602 +       int prio;
20603 +       int current_prio;
20604 +       long latency;
20605 +       long timeroffset;
20606 +       cycle_t timestamp;
20607 +};
20608 +#endif
20609 +
20610 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20611 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist);
20612 +static DEFINE_PER_CPU(struct hist_data, wakeup_latency_hist_sharedprio);
20613 +static char *wakeup_latency_hist_dir = "wakeup";
20614 +static char *wakeup_latency_hist_dir_sharedprio = "sharedprio";
20615 +static notrace void probe_wakeup_latency_hist_start(void *v,
20616 +       struct task_struct *p);
20617 +static notrace void probe_wakeup_latency_hist_stop(void *v,
20618 +       bool preempt, struct task_struct *prev, struct task_struct *next);
20619 +static notrace void probe_sched_migrate_task(void *,
20620 +       struct task_struct *task, int cpu);
20621 +static struct enable_data wakeup_latency_enabled_data = {
20622 +       .latency_type = WAKEUP_LATENCY,
20623 +       .enabled = 0,
20624 +};
20625 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc);
20626 +static DEFINE_PER_CPU(struct maxlatproc_data, wakeup_maxlatproc_sharedprio);
20627 +static DEFINE_PER_CPU(struct task_struct *, wakeup_task);
20628 +static DEFINE_PER_CPU(int, wakeup_sharedprio);
20629 +static unsigned long wakeup_pid;
20630 +#endif
20631 +
20632 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20633 +static DEFINE_PER_CPU(struct hist_data, missed_timer_offsets);
20634 +static char *missed_timer_offsets_dir = "missed_timer_offsets";
20635 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
20636 +       long long offset, struct task_struct *curr, struct task_struct *task);
20637 +static struct enable_data missed_timer_offsets_enabled_data = {
20638 +       .latency_type = MISSED_TIMER_OFFSETS,
20639 +       .enabled = 0,
20640 +};
20641 +static DEFINE_PER_CPU(struct maxlatproc_data, missed_timer_offsets_maxlatproc);
20642 +static unsigned long missed_timer_offsets_pid;
20643 +#endif
20644 +
20645 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20646 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20647 +static DEFINE_PER_CPU(struct hist_data, timerandwakeup_latency_hist);
20648 +static char *timerandwakeup_latency_hist_dir = "timerandwakeup";
20649 +static struct enable_data timerandwakeup_enabled_data = {
20650 +       .latency_type = TIMERANDWAKEUP_LATENCY,
20651 +       .enabled = 0,
20652 +};
20653 +static DEFINE_PER_CPU(struct maxlatproc_data, timerandwakeup_maxlatproc);
20654 +#endif
20655 +
20656 +void notrace latency_hist(int latency_type, int cpu, long latency,
20657 +                         long timeroffset, cycle_t stop,
20658 +                         struct task_struct *p)
20659 +{
20660 +       struct hist_data *my_hist;
20661 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20662 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20663 +       struct maxlatproc_data *mp = NULL;
20664 +#endif
20665 +
20666 +       if (!cpu_possible(cpu) || latency_type < 0 ||
20667 +           latency_type >= MAX_LATENCY_TYPE)
20668 +               return;
20669 +
20670 +       switch (latency_type) {
20671 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20672 +       case IRQSOFF_LATENCY:
20673 +               my_hist = &per_cpu(irqsoff_hist, cpu);
20674 +               break;
20675 +#endif
20676 +#ifdef CONFIG_PREEMPT_OFF_HIST
20677 +       case PREEMPTOFF_LATENCY:
20678 +               my_hist = &per_cpu(preemptoff_hist, cpu);
20679 +               break;
20680 +#endif
20681 +#if defined(CONFIG_PREEMPT_OFF_HIST) && defined(CONFIG_INTERRUPT_OFF_HIST)
20682 +       case PREEMPTIRQSOFF_LATENCY:
20683 +               my_hist = &per_cpu(preemptirqsoff_hist, cpu);
20684 +               break;
20685 +#endif
20686 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20687 +       case WAKEUP_LATENCY:
20688 +               my_hist = &per_cpu(wakeup_latency_hist, cpu);
20689 +               mp = &per_cpu(wakeup_maxlatproc, cpu);
20690 +               break;
20691 +       case WAKEUP_LATENCY_SHAREDPRIO:
20692 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
20693 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
20694 +               break;
20695 +#endif
20696 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20697 +       case MISSED_TIMER_OFFSETS:
20698 +               my_hist = &per_cpu(missed_timer_offsets, cpu);
20699 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
20700 +               break;
20701 +#endif
20702 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20703 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20704 +       case TIMERANDWAKEUP_LATENCY:
20705 +               my_hist = &per_cpu(timerandwakeup_latency_hist, cpu);
20706 +               mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
20707 +               break;
20708 +#endif
20709 +
20710 +       default:
20711 +               return;
20712 +       }
20713 +
20714 +       latency += my_hist->offset;
20715 +
20716 +       if (atomic_read(&my_hist->hist_mode) == 0)
20717 +               return;
20718 +
20719 +       if (latency < 0 || latency >= MAX_ENTRY_NUM) {
20720 +               if (latency < 0)
20721 +                       my_hist->below_hist_bound_samples++;
20722 +               else
20723 +                       my_hist->above_hist_bound_samples++;
20724 +       } else
20725 +               my_hist->hist_array[latency]++;
20726 +
20727 +       if (unlikely(latency > my_hist->max_lat ||
20728 +           my_hist->min_lat == LONG_MAX)) {
20729 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20730 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20731 +               if (latency_type == WAKEUP_LATENCY ||
20732 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
20733 +                   latency_type == MISSED_TIMER_OFFSETS ||
20734 +                   latency_type == TIMERANDWAKEUP_LATENCY) {
20735 +                       strncpy(mp->comm, p->comm, sizeof(mp->comm));
20736 +                       strncpy(mp->current_comm, current->comm,
20737 +                           sizeof(mp->current_comm));
20738 +                       mp->pid = task_pid_nr(p);
20739 +                       mp->current_pid = task_pid_nr(current);
20740 +                       mp->prio = p->prio;
20741 +                       mp->current_prio = current->prio;
20742 +                       mp->latency = latency;
20743 +                       mp->timeroffset = timeroffset;
20744 +                       mp->timestamp = stop;
20745 +               }
20746 +#endif
20747 +               my_hist->max_lat = latency;
20748 +       }
20749 +       if (unlikely(latency < my_hist->min_lat))
20750 +               my_hist->min_lat = latency;
20751 +       my_hist->total_samples++;
20752 +       my_hist->accumulate_lat += latency;
20753 +}
20754 +
20755 +static void *l_start(struct seq_file *m, loff_t *pos)
20756 +{
20757 +       loff_t *index_ptr = NULL;
20758 +       loff_t index = *pos;
20759 +       struct hist_data *my_hist = m->private;
20760 +
20761 +       if (index == 0) {
20762 +               char minstr[32], avgstr[32], maxstr[32];
20763 +
20764 +               atomic_dec(&my_hist->hist_mode);
20765 +
20766 +               if (likely(my_hist->total_samples)) {
20767 +                       long avg = (long) div64_s64(my_hist->accumulate_lat,
20768 +                           my_hist->total_samples);
20769 +                       snprintf(minstr, sizeof(minstr), "%ld",
20770 +                           my_hist->min_lat - my_hist->offset);
20771 +                       snprintf(avgstr, sizeof(avgstr), "%ld",
20772 +                           avg - my_hist->offset);
20773 +                       snprintf(maxstr, sizeof(maxstr), "%ld",
20774 +                           my_hist->max_lat - my_hist->offset);
20775 +               } else {
20776 +                       strcpy(minstr, "<undef>");
20777 +                       strcpy(avgstr, minstr);
20778 +                       strcpy(maxstr, minstr);
20779 +               }
20780 +
20781 +               seq_printf(m, "#Minimum latency: %s microseconds\n"
20782 +                          "#Average latency: %s microseconds\n"
20783 +                          "#Maximum latency: %s microseconds\n"
20784 +                          "#Total samples: %llu\n"
20785 +                          "#There are %llu samples lower than %ld"
20786 +                          " microseconds.\n"
20787 +                          "#There are %llu samples greater or equal"
20788 +                          " than %ld microseconds.\n"
20789 +                          "#usecs\t%16s\n",
20790 +                          minstr, avgstr, maxstr,
20791 +                          my_hist->total_samples,
20792 +                          my_hist->below_hist_bound_samples,
20793 +                          -my_hist->offset,
20794 +                          my_hist->above_hist_bound_samples,
20795 +                          MAX_ENTRY_NUM - my_hist->offset,
20796 +                          "samples");
20797 +       }
20798 +       if (index < MAX_ENTRY_NUM) {
20799 +               index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
20800 +               if (index_ptr)
20801 +                       *index_ptr = index;
20802 +       }
20803 +
20804 +       return index_ptr;
20805 +}
20806 +
20807 +static void *l_next(struct seq_file *m, void *p, loff_t *pos)
20808 +{
20809 +       loff_t *index_ptr = p;
20810 +       struct hist_data *my_hist = m->private;
20811 +
20812 +       if (++*pos >= MAX_ENTRY_NUM) {
20813 +               atomic_inc(&my_hist->hist_mode);
20814 +               return NULL;
20815 +       }
20816 +       *index_ptr = *pos;
20817 +       return index_ptr;
20818 +}
20819 +
20820 +static void l_stop(struct seq_file *m, void *p)
20821 +{
20822 +       kfree(p);
20823 +}
20824 +
20825 +static int l_show(struct seq_file *m, void *p)
20826 +{
20827 +       int index = *(loff_t *) p;
20828 +       struct hist_data *my_hist = m->private;
20829 +
20830 +       seq_printf(m, "%6ld\t%16llu\n", index - my_hist->offset,
20831 +           my_hist->hist_array[index]);
20832 +       return 0;
20833 +}
20834 +
20835 +static const struct seq_operations latency_hist_seq_op = {
20836 +       .start = l_start,
20837 +       .next  = l_next,
20838 +       .stop  = l_stop,
20839 +       .show  = l_show
20840 +};
20841 +
20842 +static int latency_hist_open(struct inode *inode, struct file *file)
20843 +{
20844 +       int ret;
20845 +
20846 +       ret = seq_open(file, &latency_hist_seq_op);
20847 +       if (!ret) {
20848 +               struct seq_file *seq = file->private_data;
20849 +               seq->private = inode->i_private;
20850 +       }
20851 +       return ret;
20852 +}
20853 +
20854 +static const struct file_operations latency_hist_fops = {
20855 +       .open = latency_hist_open,
20856 +       .read = seq_read,
20857 +       .llseek = seq_lseek,
20858 +       .release = seq_release,
20859 +};
20860 +
20861 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20862 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20863 +static void clear_maxlatprocdata(struct maxlatproc_data *mp)
20864 +{
20865 +       mp->comm[0] = mp->current_comm[0] = '\0';
20866 +       mp->prio = mp->current_prio = mp->pid = mp->current_pid =
20867 +           mp->latency = mp->timeroffset = -1;
20868 +       mp->timestamp = 0;
20869 +}
20870 +#endif
20871 +
20872 +static void hist_reset(struct hist_data *hist)
20873 +{
20874 +       atomic_dec(&hist->hist_mode);
20875 +
20876 +       memset(hist->hist_array, 0, sizeof(hist->hist_array));
20877 +       hist->below_hist_bound_samples = 0ULL;
20878 +       hist->above_hist_bound_samples = 0ULL;
20879 +       hist->min_lat = LONG_MAX;
20880 +       hist->max_lat = LONG_MIN;
20881 +       hist->total_samples = 0ULL;
20882 +       hist->accumulate_lat = 0LL;
20883 +
20884 +       atomic_inc(&hist->hist_mode);
20885 +}
20886 +
20887 +static ssize_t
20888 +latency_hist_reset(struct file *file, const char __user *a,
20889 +                  size_t size, loff_t *off)
20890 +{
20891 +       int cpu;
20892 +       struct hist_data *hist = NULL;
20893 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20894 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20895 +       struct maxlatproc_data *mp = NULL;
20896 +#endif
20897 +       off_t latency_type = (off_t) file->private_data;
20898 +
20899 +       for_each_online_cpu(cpu) {
20900 +
20901 +               switch (latency_type) {
20902 +#ifdef CONFIG_PREEMPT_OFF_HIST
20903 +               case PREEMPTOFF_LATENCY:
20904 +                       hist = &per_cpu(preemptoff_hist, cpu);
20905 +                       break;
20906 +#endif
20907 +#ifdef CONFIG_INTERRUPT_OFF_HIST
20908 +               case IRQSOFF_LATENCY:
20909 +                       hist = &per_cpu(irqsoff_hist, cpu);
20910 +                       break;
20911 +#endif
20912 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
20913 +               case PREEMPTIRQSOFF_LATENCY:
20914 +                       hist = &per_cpu(preemptirqsoff_hist, cpu);
20915 +                       break;
20916 +#endif
20917 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
20918 +               case WAKEUP_LATENCY:
20919 +                       hist = &per_cpu(wakeup_latency_hist, cpu);
20920 +                       mp = &per_cpu(wakeup_maxlatproc, cpu);
20921 +                       break;
20922 +               case WAKEUP_LATENCY_SHAREDPRIO:
20923 +                       hist = &per_cpu(wakeup_latency_hist_sharedprio, cpu);
20924 +                       mp = &per_cpu(wakeup_maxlatproc_sharedprio, cpu);
20925 +                       break;
20926 +#endif
20927 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
20928 +               case MISSED_TIMER_OFFSETS:
20929 +                       hist = &per_cpu(missed_timer_offsets, cpu);
20930 +                       mp = &per_cpu(missed_timer_offsets_maxlatproc, cpu);
20931 +                       break;
20932 +#endif
20933 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
20934 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20935 +               case TIMERANDWAKEUP_LATENCY:
20936 +                       hist = &per_cpu(timerandwakeup_latency_hist, cpu);
20937 +                       mp = &per_cpu(timerandwakeup_maxlatproc, cpu);
20938 +                       break;
20939 +#endif
20940 +               }
20941 +
20942 +               hist_reset(hist);
20943 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20944 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20945 +               if (latency_type == WAKEUP_LATENCY ||
20946 +                   latency_type == WAKEUP_LATENCY_SHAREDPRIO ||
20947 +                   latency_type == MISSED_TIMER_OFFSETS ||
20948 +                   latency_type == TIMERANDWAKEUP_LATENCY)
20949 +                       clear_maxlatprocdata(mp);
20950 +#endif
20951 +       }
20952 +
20953 +       return size;
20954 +}
20955 +
20956 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20957 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20958 +static ssize_t
20959 +show_pid(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
20960 +{
20961 +       char buf[64];
20962 +       int r;
20963 +       unsigned long *this_pid = file->private_data;
20964 +
20965 +       r = snprintf(buf, sizeof(buf), "%lu\n", *this_pid);
20966 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
20967 +}
20968 +
20969 +static ssize_t do_pid(struct file *file, const char __user *ubuf,
20970 +                     size_t cnt, loff_t *ppos)
20971 +{
20972 +       char buf[64];
20973 +       unsigned long pid;
20974 +       unsigned long *this_pid = file->private_data;
20975 +
20976 +       if (cnt >= sizeof(buf))
20977 +               return -EINVAL;
20978 +
20979 +       if (copy_from_user(&buf, ubuf, cnt))
20980 +               return -EFAULT;
20981 +
20982 +       buf[cnt] = '\0';
20983 +
20984 +       if (kstrtoul(buf, 10, &pid))
20985 +               return -EINVAL;
20986 +
20987 +       *this_pid = pid;
20988 +
20989 +       return cnt;
20990 +}
20991 +#endif
20992 +
20993 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
20994 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
20995 +static ssize_t
20996 +show_maxlatproc(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
20997 +{
20998 +       int r;
20999 +       struct maxlatproc_data *mp = file->private_data;
21000 +       int strmaxlen = (TASK_COMM_LEN * 2) + (8 * 8);
21001 +       unsigned long long t;
21002 +       unsigned long usecs, secs;
21003 +       char *buf;
21004 +
21005 +       if (mp->pid == -1 || mp->current_pid == -1) {
21006 +               buf = "(none)\n";
21007 +               return simple_read_from_buffer(ubuf, cnt, ppos, buf,
21008 +                   strlen(buf));
21009 +       }
21010 +
21011 +       buf = kmalloc(strmaxlen, GFP_KERNEL);
21012 +       if (buf == NULL)
21013 +               return -ENOMEM;
21014 +
21015 +       t = ns2usecs(mp->timestamp);
21016 +       usecs = do_div(t, USEC_PER_SEC);
21017 +       secs = (unsigned long) t;
21018 +       r = snprintf(buf, strmaxlen,
21019 +           "%d %d %ld (%ld) %s <- %d %d %s %lu.%06lu\n", mp->pid,
21020 +           MAX_RT_PRIO-1 - mp->prio, mp->latency, mp->timeroffset, mp->comm,
21021 +           mp->current_pid, MAX_RT_PRIO-1 - mp->current_prio, mp->current_comm,
21022 +           secs, usecs);
21023 +       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21024 +       kfree(buf);
21025 +       return r;
21026 +}
21027 +#endif
21028 +
21029 +static ssize_t
21030 +show_enable(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos)
21031 +{
21032 +       char buf[64];
21033 +       struct enable_data *ed = file->private_data;
21034 +       int r;
21035 +
21036 +       r = snprintf(buf, sizeof(buf), "%d\n", ed->enabled);
21037 +       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
21038 +}
21039 +
21040 +static ssize_t
21041 +do_enable(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos)
21042 +{
21043 +       char buf[64];
21044 +       long enable;
21045 +       struct enable_data *ed = file->private_data;
21046 +
21047 +       if (cnt >= sizeof(buf))
21048 +               return -EINVAL;
21049 +
21050 +       if (copy_from_user(&buf, ubuf, cnt))
21051 +               return -EFAULT;
21052 +
21053 +       buf[cnt] = 0;
21054 +
21055 +       if (kstrtoul(buf, 10, &enable))
21056 +               return -EINVAL;
21057 +
21058 +       if ((enable && ed->enabled) || (!enable && !ed->enabled))
21059 +               return cnt;
21060 +
21061 +       if (enable) {
21062 +               int ret;
21063 +
21064 +               switch (ed->latency_type) {
21065 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21066 +               case PREEMPTIRQSOFF_LATENCY:
21067 +                       ret = register_trace_preemptirqsoff_hist(
21068 +                           probe_preemptirqsoff_hist, NULL);
21069 +                       if (ret) {
21070 +                               pr_info("wakeup trace: Couldn't assign "
21071 +                                   "probe_preemptirqsoff_hist "
21072 +                                   "to trace_preemptirqsoff_hist\n");
21073 +                               return ret;
21074 +                       }
21075 +                       break;
21076 +#endif
21077 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21078 +               case WAKEUP_LATENCY:
21079 +                       ret = register_trace_sched_wakeup(
21080 +                           probe_wakeup_latency_hist_start, NULL);
21081 +                       if (ret) {
21082 +                               pr_info("wakeup trace: Couldn't assign "
21083 +                                   "probe_wakeup_latency_hist_start "
21084 +                                   "to trace_sched_wakeup\n");
21085 +                               return ret;
21086 +                       }
21087 +                       ret = register_trace_sched_wakeup_new(
21088 +                           probe_wakeup_latency_hist_start, NULL);
21089 +                       if (ret) {
21090 +                               pr_info("wakeup trace: Couldn't assign "
21091 +                                   "probe_wakeup_latency_hist_start "
21092 +                                   "to trace_sched_wakeup_new\n");
21093 +                               unregister_trace_sched_wakeup(
21094 +                                   probe_wakeup_latency_hist_start, NULL);
21095 +                               return ret;
21096 +                       }
21097 +                       ret = register_trace_sched_switch(
21098 +                           probe_wakeup_latency_hist_stop, NULL);
21099 +                       if (ret) {
21100 +                               pr_info("wakeup trace: Couldn't assign "
21101 +                                   "probe_wakeup_latency_hist_stop "
21102 +                                   "to trace_sched_switch\n");
21103 +                               unregister_trace_sched_wakeup(
21104 +                                   probe_wakeup_latency_hist_start, NULL);
21105 +                               unregister_trace_sched_wakeup_new(
21106 +                                   probe_wakeup_latency_hist_start, NULL);
21107 +                               return ret;
21108 +                       }
21109 +                       ret = register_trace_sched_migrate_task(
21110 +                           probe_sched_migrate_task, NULL);
21111 +                       if (ret) {
21112 +                               pr_info("wakeup trace: Couldn't assign "
21113 +                                   "probe_sched_migrate_task "
21114 +                                   "to trace_sched_migrate_task\n");
21115 +                               unregister_trace_sched_wakeup(
21116 +                                   probe_wakeup_latency_hist_start, NULL);
21117 +                               unregister_trace_sched_wakeup_new(
21118 +                                   probe_wakeup_latency_hist_start, NULL);
21119 +                               unregister_trace_sched_switch(
21120 +                                   probe_wakeup_latency_hist_stop, NULL);
21121 +                               return ret;
21122 +                       }
21123 +                       break;
21124 +#endif
21125 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21126 +               case MISSED_TIMER_OFFSETS:
21127 +                       ret = register_trace_hrtimer_interrupt(
21128 +                           probe_hrtimer_interrupt, NULL);
21129 +                       if (ret) {
21130 +                               pr_info("missed timer trace: Couldn't assign "
21131 +                                   "probe_hrtimer_interrupt "
21132 +                                   "to trace_hrtimer_interrupt\n");
21133 +                               return ret;
21134 +                       }
21135 +                       break;
21136 +#endif
21137 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21138 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21139 +               case TIMERANDWAKEUP_LATENCY:
21140 +                       if (!wakeup_latency_enabled_data.enabled ||
21141 +                           !missed_timer_offsets_enabled_data.enabled)
21142 +                               return -EINVAL;
21143 +                       break;
21144 +#endif
21145 +               default:
21146 +                       break;
21147 +               }
21148 +       } else {
21149 +               switch (ed->latency_type) {
21150 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21151 +               case PREEMPTIRQSOFF_LATENCY:
21152 +                       {
21153 +                               int cpu;
21154 +
21155 +                               unregister_trace_preemptirqsoff_hist(
21156 +                                   probe_preemptirqsoff_hist, NULL);
21157 +                               for_each_online_cpu(cpu) {
21158 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21159 +                                       per_cpu(hist_irqsoff_counting,
21160 +                                           cpu) = 0;
21161 +#endif
21162 +#ifdef CONFIG_PREEMPT_OFF_HIST
21163 +                                       per_cpu(hist_preemptoff_counting,
21164 +                                           cpu) = 0;
21165 +#endif
21166 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21167 +                                       per_cpu(hist_preemptirqsoff_counting,
21168 +                                           cpu) = 0;
21169 +#endif
21170 +                               }
21171 +                       }
21172 +                       break;
21173 +#endif
21174 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21175 +               case WAKEUP_LATENCY:
21176 +                       {
21177 +                               int cpu;
21178 +
21179 +                               unregister_trace_sched_wakeup(
21180 +                                   probe_wakeup_latency_hist_start, NULL);
21181 +                               unregister_trace_sched_wakeup_new(
21182 +                                   probe_wakeup_latency_hist_start, NULL);
21183 +                               unregister_trace_sched_switch(
21184 +                                   probe_wakeup_latency_hist_stop, NULL);
21185 +                               unregister_trace_sched_migrate_task(
21186 +                                   probe_sched_migrate_task, NULL);
21187 +
21188 +                               for_each_online_cpu(cpu) {
21189 +                                       per_cpu(wakeup_task, cpu) = NULL;
21190 +                                       per_cpu(wakeup_sharedprio, cpu) = 0;
21191 +                               }
21192 +                       }
21193 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21194 +                       timerandwakeup_enabled_data.enabled = 0;
21195 +#endif
21196 +                       break;
21197 +#endif
21198 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21199 +               case MISSED_TIMER_OFFSETS:
21200 +                       unregister_trace_hrtimer_interrupt(
21201 +                           probe_hrtimer_interrupt, NULL);
21202 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21203 +                       timerandwakeup_enabled_data.enabled = 0;
21204 +#endif
21205 +                       break;
21206 +#endif
21207 +               default:
21208 +                       break;
21209 +               }
21210 +       }
21211 +       ed->enabled = enable;
21212 +       return cnt;
21213 +}
21214 +
21215 +static const struct file_operations latency_hist_reset_fops = {
21216 +       .open = tracing_open_generic,
21217 +       .write = latency_hist_reset,
21218 +};
21219 +
21220 +static const struct file_operations enable_fops = {
21221 +       .open = tracing_open_generic,
21222 +       .read = show_enable,
21223 +       .write = do_enable,
21224 +};
21225 +
21226 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21227 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21228 +static const struct file_operations pid_fops = {
21229 +       .open = tracing_open_generic,
21230 +       .read = show_pid,
21231 +       .write = do_pid,
21232 +};
21233 +
21234 +static const struct file_operations maxlatproc_fops = {
21235 +       .open = tracing_open_generic,
21236 +       .read = show_maxlatproc,
21237 +};
21238 +#endif
21239 +
21240 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21241 +static notrace void probe_preemptirqsoff_hist(void *v, int reason,
21242 +       int starthist)
21243 +{
21244 +       int cpu = raw_smp_processor_id();
21245 +       int time_set = 0;
21246 +
21247 +       if (starthist) {
21248 +               cycle_t uninitialized_var(start);
21249 +
21250 +               if (!preempt_count() && !irqs_disabled())
21251 +                       return;
21252 +
21253 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21254 +               if ((reason == IRQS_OFF || reason == TRACE_START) &&
21255 +                   !per_cpu(hist_irqsoff_counting, cpu)) {
21256 +                       per_cpu(hist_irqsoff_counting, cpu) = 1;
21257 +                       start = ftrace_now(cpu);
21258 +                       time_set++;
21259 +                       per_cpu(hist_irqsoff_start, cpu) = start;
21260 +               }
21261 +#endif
21262 +
21263 +#ifdef CONFIG_PREEMPT_OFF_HIST
21264 +               if ((reason == PREEMPT_OFF || reason == TRACE_START) &&
21265 +                   !per_cpu(hist_preemptoff_counting, cpu)) {
21266 +                       per_cpu(hist_preemptoff_counting, cpu) = 1;
21267 +                       if (!(time_set++))
21268 +                               start = ftrace_now(cpu);
21269 +                       per_cpu(hist_preemptoff_start, cpu) = start;
21270 +               }
21271 +#endif
21272 +
21273 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21274 +               if (per_cpu(hist_irqsoff_counting, cpu) &&
21275 +                   per_cpu(hist_preemptoff_counting, cpu) &&
21276 +                   !per_cpu(hist_preemptirqsoff_counting, cpu)) {
21277 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 1;
21278 +                       if (!time_set)
21279 +                               start = ftrace_now(cpu);
21280 +                       per_cpu(hist_preemptirqsoff_start, cpu) = start;
21281 +               }
21282 +#endif
21283 +       } else {
21284 +               cycle_t uninitialized_var(stop);
21285 +
21286 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21287 +               if ((reason == IRQS_ON || reason == TRACE_STOP) &&
21288 +                   per_cpu(hist_irqsoff_counting, cpu)) {
21289 +                       cycle_t start = per_cpu(hist_irqsoff_start, cpu);
21290 +
21291 +                       stop = ftrace_now(cpu);
21292 +                       time_set++;
21293 +                       if (start) {
21294 +                               long latency = ((long) (stop - start)) /
21295 +                                   NSECS_PER_USECS;
21296 +
21297 +                               latency_hist(IRQSOFF_LATENCY, cpu, latency, 0,
21298 +                                   stop, NULL);
21299 +                       }
21300 +                       per_cpu(hist_irqsoff_counting, cpu) = 0;
21301 +               }
21302 +#endif
21303 +
21304 +#ifdef CONFIG_PREEMPT_OFF_HIST
21305 +               if ((reason == PREEMPT_ON || reason == TRACE_STOP) &&
21306 +                   per_cpu(hist_preemptoff_counting, cpu)) {
21307 +                       cycle_t start = per_cpu(hist_preemptoff_start, cpu);
21308 +
21309 +                       if (!(time_set++))
21310 +                               stop = ftrace_now(cpu);
21311 +                       if (start) {
21312 +                               long latency = ((long) (stop - start)) /
21313 +                                   NSECS_PER_USECS;
21314 +
21315 +                               latency_hist(PREEMPTOFF_LATENCY, cpu, latency,
21316 +                                   0, stop, NULL);
21317 +                       }
21318 +                       per_cpu(hist_preemptoff_counting, cpu) = 0;
21319 +               }
21320 +#endif
21321 +
21322 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21323 +               if ((!per_cpu(hist_irqsoff_counting, cpu) ||
21324 +                    !per_cpu(hist_preemptoff_counting, cpu)) &&
21325 +                  per_cpu(hist_preemptirqsoff_counting, cpu)) {
21326 +                       cycle_t start = per_cpu(hist_preemptirqsoff_start, cpu);
21327 +
21328 +                       if (!time_set)
21329 +                               stop = ftrace_now(cpu);
21330 +                       if (start) {
21331 +                               long latency = ((long) (stop - start)) /
21332 +                                   NSECS_PER_USECS;
21333 +
21334 +                               latency_hist(PREEMPTIRQSOFF_LATENCY, cpu,
21335 +                                   latency, 0, stop, NULL);
21336 +                       }
21337 +                       per_cpu(hist_preemptirqsoff_counting, cpu) = 0;
21338 +               }
21339 +#endif
21340 +       }
21341 +}
21342 +#endif
21343 +
21344 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21345 +static DEFINE_RAW_SPINLOCK(wakeup_lock);
21346 +static notrace void probe_sched_migrate_task(void *v, struct task_struct *task,
21347 +       int cpu)
21348 +{
21349 +       int old_cpu = task_cpu(task);
21350 +
21351 +       if (cpu != old_cpu) {
21352 +               unsigned long flags;
21353 +               struct task_struct *cpu_wakeup_task;
21354 +
21355 +               raw_spin_lock_irqsave(&wakeup_lock, flags);
21356 +
21357 +               cpu_wakeup_task = per_cpu(wakeup_task, old_cpu);
21358 +               if (task == cpu_wakeup_task) {
21359 +                       put_task_struct(cpu_wakeup_task);
21360 +                       per_cpu(wakeup_task, old_cpu) = NULL;
21361 +                       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = task;
21362 +                       get_task_struct(cpu_wakeup_task);
21363 +               }
21364 +
21365 +               raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21366 +       }
21367 +}
21368 +
21369 +static notrace void probe_wakeup_latency_hist_start(void *v,
21370 +       struct task_struct *p)
21371 +{
21372 +       unsigned long flags;
21373 +       struct task_struct *curr = current;
21374 +       int cpu = task_cpu(p);
21375 +       struct task_struct *cpu_wakeup_task;
21376 +
21377 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
21378 +
21379 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
21380 +
21381 +       if (wakeup_pid) {
21382 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
21383 +                   p->prio == curr->prio)
21384 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21385 +               if (likely(wakeup_pid != task_pid_nr(p)))
21386 +                       goto out;
21387 +       } else {
21388 +               if (likely(!rt_task(p)) ||
21389 +                   (cpu_wakeup_task && p->prio > cpu_wakeup_task->prio) ||
21390 +                   p->prio > curr->prio)
21391 +                       goto out;
21392 +               if ((cpu_wakeup_task && p->prio == cpu_wakeup_task->prio) ||
21393 +                   p->prio == curr->prio)
21394 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21395 +       }
21396 +
21397 +       if (cpu_wakeup_task)
21398 +               put_task_struct(cpu_wakeup_task);
21399 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu) = p;
21400 +       get_task_struct(cpu_wakeup_task);
21401 +       cpu_wakeup_task->preempt_timestamp_hist =
21402 +               ftrace_now(raw_smp_processor_id());
21403 +out:
21404 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21405 +}
21406 +
21407 +static notrace void probe_wakeup_latency_hist_stop(void *v,
21408 +       bool preempt, struct task_struct *prev, struct task_struct *next)
21409 +{
21410 +       unsigned long flags;
21411 +       int cpu = task_cpu(next);
21412 +       long latency;
21413 +       cycle_t stop;
21414 +       struct task_struct *cpu_wakeup_task;
21415 +
21416 +       raw_spin_lock_irqsave(&wakeup_lock, flags);
21417 +
21418 +       cpu_wakeup_task = per_cpu(wakeup_task, cpu);
21419 +
21420 +       if (cpu_wakeup_task == NULL)
21421 +               goto out;
21422 +
21423 +       /* Already running? */
21424 +       if (unlikely(current == cpu_wakeup_task))
21425 +               goto out_reset;
21426 +
21427 +       if (next != cpu_wakeup_task) {
21428 +               if (next->prio < cpu_wakeup_task->prio)
21429 +                       goto out_reset;
21430 +
21431 +               if (next->prio == cpu_wakeup_task->prio)
21432 +                       per_cpu(wakeup_sharedprio, cpu) = 1;
21433 +
21434 +               goto out;
21435 +       }
21436 +
21437 +       if (current->prio == cpu_wakeup_task->prio)
21438 +               per_cpu(wakeup_sharedprio, cpu) = 1;
21439 +
21440 +       /*
21441 +        * The task we are waiting for is about to be switched to.
21442 +        * Calculate latency and store it in histogram.
21443 +        */
21444 +       stop = ftrace_now(raw_smp_processor_id());
21445 +
21446 +       latency = ((long) (stop - next->preempt_timestamp_hist)) /
21447 +           NSECS_PER_USECS;
21448 +
21449 +       if (per_cpu(wakeup_sharedprio, cpu)) {
21450 +               latency_hist(WAKEUP_LATENCY_SHAREDPRIO, cpu, latency, 0, stop,
21451 +                   next);
21452 +               per_cpu(wakeup_sharedprio, cpu) = 0;
21453 +       } else {
21454 +               latency_hist(WAKEUP_LATENCY, cpu, latency, 0, stop, next);
21455 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21456 +               if (timerandwakeup_enabled_data.enabled) {
21457 +                       latency_hist(TIMERANDWAKEUP_LATENCY, cpu,
21458 +                           next->timer_offset + latency, next->timer_offset,
21459 +                           stop, next);
21460 +               }
21461 +#endif
21462 +       }
21463 +
21464 +out_reset:
21465 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21466 +       next->timer_offset = 0;
21467 +#endif
21468 +       put_task_struct(cpu_wakeup_task);
21469 +       per_cpu(wakeup_task, cpu) = NULL;
21470 +out:
21471 +       raw_spin_unlock_irqrestore(&wakeup_lock, flags);
21472 +}
21473 +#endif
21474 +
21475 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21476 +static notrace void probe_hrtimer_interrupt(void *v, int cpu,
21477 +       long long latency_ns, struct task_struct *curr,
21478 +       struct task_struct *task)
21479 +{
21480 +       if (latency_ns <= 0 && task != NULL && rt_task(task) &&
21481 +           (task->prio < curr->prio ||
21482 +           (task->prio == curr->prio &&
21483 +           !cpumask_test_cpu(cpu, &task->cpus_allowed)))) {
21484 +               long latency;
21485 +               cycle_t now;
21486 +
21487 +               if (missed_timer_offsets_pid) {
21488 +                       if (likely(missed_timer_offsets_pid !=
21489 +                           task_pid_nr(task)))
21490 +                               return;
21491 +               }
21492 +
21493 +               now = ftrace_now(cpu);
21494 +               latency = (long) div_s64(-latency_ns, NSECS_PER_USECS);
21495 +               latency_hist(MISSED_TIMER_OFFSETS, cpu, latency, latency, now,
21496 +                   task);
21497 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21498 +               task->timer_offset = latency;
21499 +#endif
21500 +       }
21501 +}
21502 +#endif
21503 +
21504 +static __init int latency_hist_init(void)
21505 +{
21506 +       struct dentry *latency_hist_root = NULL;
21507 +       struct dentry *dentry;
21508 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21509 +       struct dentry *dentry_sharedprio;
21510 +#endif
21511 +       struct dentry *entry;
21512 +       struct dentry *enable_root;
21513 +       int i = 0;
21514 +       struct hist_data *my_hist;
21515 +       char name[64];
21516 +       char *cpufmt = "CPU%d";
21517 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) || \
21518 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21519 +       char *cpufmt_maxlatproc = "max_latency-CPU%d";
21520 +       struct maxlatproc_data *mp = NULL;
21521 +#endif
21522 +
21523 +       dentry = tracing_init_dentry();
21524 +       latency_hist_root = debugfs_create_dir(latency_hist_dir_root, dentry);
21525 +       enable_root = debugfs_create_dir("enable", latency_hist_root);
21526 +
21527 +#ifdef CONFIG_INTERRUPT_OFF_HIST
21528 +       dentry = debugfs_create_dir(irqsoff_hist_dir, latency_hist_root);
21529 +       for_each_possible_cpu(i) {
21530 +               sprintf(name, cpufmt, i);
21531 +               entry = debugfs_create_file(name, 0444, dentry,
21532 +                   &per_cpu(irqsoff_hist, i), &latency_hist_fops);
21533 +               my_hist = &per_cpu(irqsoff_hist, i);
21534 +               atomic_set(&my_hist->hist_mode, 1);
21535 +               my_hist->min_lat = LONG_MAX;
21536 +       }
21537 +       entry = debugfs_create_file("reset", 0644, dentry,
21538 +           (void *)IRQSOFF_LATENCY, &latency_hist_reset_fops);
21539 +#endif
21540 +
21541 +#ifdef CONFIG_PREEMPT_OFF_HIST
21542 +       dentry = debugfs_create_dir(preemptoff_hist_dir,
21543 +           latency_hist_root);
21544 +       for_each_possible_cpu(i) {
21545 +               sprintf(name, cpufmt, i);
21546 +               entry = debugfs_create_file(name, 0444, dentry,
21547 +                   &per_cpu(preemptoff_hist, i), &latency_hist_fops);
21548 +               my_hist = &per_cpu(preemptoff_hist, i);
21549 +               atomic_set(&my_hist->hist_mode, 1);
21550 +               my_hist->min_lat = LONG_MAX;
21551 +       }
21552 +       entry = debugfs_create_file("reset", 0644, dentry,
21553 +           (void *)PREEMPTOFF_LATENCY, &latency_hist_reset_fops);
21554 +#endif
21555 +
21556 +#if defined(CONFIG_INTERRUPT_OFF_HIST) && defined(CONFIG_PREEMPT_OFF_HIST)
21557 +       dentry = debugfs_create_dir(preemptirqsoff_hist_dir,
21558 +           latency_hist_root);
21559 +       for_each_possible_cpu(i) {
21560 +               sprintf(name, cpufmt, i);
21561 +               entry = debugfs_create_file(name, 0444, dentry,
21562 +                   &per_cpu(preemptirqsoff_hist, i), &latency_hist_fops);
21563 +               my_hist = &per_cpu(preemptirqsoff_hist, i);
21564 +               atomic_set(&my_hist->hist_mode, 1);
21565 +               my_hist->min_lat = LONG_MAX;
21566 +       }
21567 +       entry = debugfs_create_file("reset", 0644, dentry,
21568 +           (void *)PREEMPTIRQSOFF_LATENCY, &latency_hist_reset_fops);
21569 +#endif
21570 +
21571 +#if defined(CONFIG_INTERRUPT_OFF_HIST) || defined(CONFIG_PREEMPT_OFF_HIST)
21572 +       entry = debugfs_create_file("preemptirqsoff", 0644,
21573 +           enable_root, (void *)&preemptirqsoff_enabled_data,
21574 +           &enable_fops);
21575 +#endif
21576 +
21577 +#ifdef CONFIG_WAKEUP_LATENCY_HIST
21578 +       dentry = debugfs_create_dir(wakeup_latency_hist_dir,
21579 +           latency_hist_root);
21580 +       dentry_sharedprio = debugfs_create_dir(
21581 +           wakeup_latency_hist_dir_sharedprio, dentry);
21582 +       for_each_possible_cpu(i) {
21583 +               sprintf(name, cpufmt, i);
21584 +
21585 +               entry = debugfs_create_file(name, 0444, dentry,
21586 +                   &per_cpu(wakeup_latency_hist, i),
21587 +                   &latency_hist_fops);
21588 +               my_hist = &per_cpu(wakeup_latency_hist, i);
21589 +               atomic_set(&my_hist->hist_mode, 1);
21590 +               my_hist->min_lat = LONG_MAX;
21591 +
21592 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio,
21593 +                   &per_cpu(wakeup_latency_hist_sharedprio, i),
21594 +                   &latency_hist_fops);
21595 +               my_hist = &per_cpu(wakeup_latency_hist_sharedprio, i);
21596 +               atomic_set(&my_hist->hist_mode, 1);
21597 +               my_hist->min_lat = LONG_MAX;
21598 +
21599 +               sprintf(name, cpufmt_maxlatproc, i);
21600 +
21601 +               mp = &per_cpu(wakeup_maxlatproc, i);
21602 +               entry = debugfs_create_file(name, 0444, dentry, mp,
21603 +                   &maxlatproc_fops);
21604 +               clear_maxlatprocdata(mp);
21605 +
21606 +               mp = &per_cpu(wakeup_maxlatproc_sharedprio, i);
21607 +               entry = debugfs_create_file(name, 0444, dentry_sharedprio, mp,
21608 +                   &maxlatproc_fops);
21609 +               clear_maxlatprocdata(mp);
21610 +       }
21611 +       entry = debugfs_create_file("pid", 0644, dentry,
21612 +           (void *)&wakeup_pid, &pid_fops);
21613 +       entry = debugfs_create_file("reset", 0644, dentry,
21614 +           (void *)WAKEUP_LATENCY, &latency_hist_reset_fops);
21615 +       entry = debugfs_create_file("reset", 0644, dentry_sharedprio,
21616 +           (void *)WAKEUP_LATENCY_SHAREDPRIO, &latency_hist_reset_fops);
21617 +       entry = debugfs_create_file("wakeup", 0644,
21618 +           enable_root, (void *)&wakeup_latency_enabled_data,
21619 +           &enable_fops);
21620 +#endif
21621 +
21622 +#ifdef CONFIG_MISSED_TIMER_OFFSETS_HIST
21623 +       dentry = debugfs_create_dir(missed_timer_offsets_dir,
21624 +           latency_hist_root);
21625 +       for_each_possible_cpu(i) {
21626 +               sprintf(name, cpufmt, i);
21627 +               entry = debugfs_create_file(name, 0444, dentry,
21628 +                   &per_cpu(missed_timer_offsets, i), &latency_hist_fops);
21629 +               my_hist = &per_cpu(missed_timer_offsets, i);
21630 +               atomic_set(&my_hist->hist_mode, 1);
21631 +               my_hist->min_lat = LONG_MAX;
21632 +
21633 +               sprintf(name, cpufmt_maxlatproc, i);
21634 +               mp = &per_cpu(missed_timer_offsets_maxlatproc, i);
21635 +               entry = debugfs_create_file(name, 0444, dentry, mp,
21636 +                   &maxlatproc_fops);
21637 +               clear_maxlatprocdata(mp);
21638 +       }
21639 +       entry = debugfs_create_file("pid", 0644, dentry,
21640 +           (void *)&missed_timer_offsets_pid, &pid_fops);
21641 +       entry = debugfs_create_file("reset", 0644, dentry,
21642 +           (void *)MISSED_TIMER_OFFSETS, &latency_hist_reset_fops);
21643 +       entry = debugfs_create_file("missed_timer_offsets", 0644,
21644 +           enable_root, (void *)&missed_timer_offsets_enabled_data,
21645 +           &enable_fops);
21646 +#endif
21647 +
21648 +#if defined(CONFIG_WAKEUP_LATENCY_HIST) && \
21649 +       defined(CONFIG_MISSED_TIMER_OFFSETS_HIST)
21650 +       dentry = debugfs_create_dir(timerandwakeup_latency_hist_dir,
21651 +           latency_hist_root);
21652 +       for_each_possible_cpu(i) {
21653 +               sprintf(name, cpufmt, i);
21654 +               entry = debugfs_create_file(name, 0444, dentry,
21655 +                   &per_cpu(timerandwakeup_latency_hist, i),
21656 +                   &latency_hist_fops);
21657 +               my_hist = &per_cpu(timerandwakeup_latency_hist, i);
21658 +               atomic_set(&my_hist->hist_mode, 1);
21659 +               my_hist->min_lat = LONG_MAX;
21660 +
21661 +               sprintf(name, cpufmt_maxlatproc, i);
21662 +               mp = &per_cpu(timerandwakeup_maxlatproc, i);
21663 +               entry = debugfs_create_file(name, 0444, dentry, mp,
21664 +                   &maxlatproc_fops);
21665 +               clear_maxlatprocdata(mp);
21666 +       }
21667 +       entry = debugfs_create_file("reset", 0644, dentry,
21668 +           (void *)TIMERANDWAKEUP_LATENCY, &latency_hist_reset_fops);
21669 +       entry = debugfs_create_file("timerandwakeup", 0644,
21670 +           enable_root, (void *)&timerandwakeup_enabled_data,
21671 +           &enable_fops);
21672 +#endif
21673 +       return 0;
21674 +}
21675 +
21676 +device_initcall(latency_hist_init);
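
[Editorial illustration, not part of the patch] latency_hist_init() above only builds a debugfs tree; all interaction with the histograms goes through reads and writes on those files. The userspace sketch below shows one way that interface might be exercised. The mount point and the directory names ("latency_hist", "enable", "wakeup") are assumptions based on the identifiers used in this file (latency_hist_dir_root, wakeup_latency_hist_dir are defined earlier in the patch, outside this hunk); the per-CPU file name follows cpufmt = "CPU%d" from latency_hist_init().

/*
 * Hedged userspace sketch: enable the wakeup latency histogram and
 * dump one per-CPU bucket file.  Paths are assumptions, adjust them
 * to match your kernel and debugfs mount point.
 */
#include <stdio.h>
#include <stdlib.h>

#define HIST_DIR "/sys/kernel/debug/tracing/latency_hist"

int main(void)
{
	FILE *f;
	char line[256];

	/* equivalent of "echo 1 > enable/wakeup", handled by do_enable() */
	f = fopen(HIST_DIR "/enable/wakeup", "w");
	if (!f) {
		perror("enable/wakeup");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);
	fclose(f);

	/* dump the CPU0 histogram file created by latency_hist_init() */
	f = fopen(HIST_DIR "/wakeup/CPU0", "r");
	if (!f) {
		perror("wakeup/CPU0");
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return EXIT_SUCCESS;
}
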
21677 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
21678 index 7bc56762ca35..84ffcb813263 100644
21679 --- a/kernel/trace/trace.c
21680 +++ b/kernel/trace/trace.c
21681 @@ -1897,6 +1897,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
21682         struct task_struct *tsk = current;
21683  
21684         entry->preempt_count            = pc & 0xff;
21685 +       entry->preempt_lazy_count       = preempt_lazy_count();
21686         entry->pid                      = (tsk) ? tsk->pid : 0;
21687         entry->flags =
21688  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
21689 @@ -1907,8 +1908,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
21690                 ((pc & NMI_MASK    ) ? TRACE_FLAG_NMI     : 0) |
21691                 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
21692                 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
21693 -               (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
21694 +               (tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
21695 +               (need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
21696                 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
21697 +
21698 +       entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
21699  }
21700  EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
21701  
21702 @@ -2892,14 +2896,17 @@ get_total_entries(struct trace_buffer *buf,
21703  
21704  static void print_lat_help_header(struct seq_file *m)
21705  {
21706 -       seq_puts(m, "#                  _------=> CPU#            \n"
21707 -                   "#                 / _-----=> irqs-off        \n"
21708 -                   "#                | / _----=> need-resched    \n"
21709 -                   "#                || / _---=> hardirq/softirq \n"
21710 -                   "#                ||| / _--=> preempt-depth   \n"
21711 -                   "#                |||| /     delay            \n"
21712 -                   "#  cmd     pid   ||||| time  |   caller      \n"
21713 -                   "#     \\   /      |||||  \\    |   /         \n");
21714 +       seq_puts(m, "#                  _--------=> CPU#              \n"
21715 +                   "#                 / _-------=> irqs-off          \n"
21716 +                   "#                | / _------=> need-resched      \n"
21717 +                   "#                || / _-----=> need-resched_lazy \n"
21718 +                   "#                ||| / _----=> hardirq/softirq   \n"
21719 +                   "#                |||| / _---=> preempt-depth     \n"
21720 +                   "#                ||||| / _--=> preempt-lazy-depth\n"
21721 +                   "#                |||||| / _-=> migrate-disable   \n"
21722 +                   "#                ||||||| /     delay             \n"
21723 +                   "# cmd     pid    |||||||| time   |  caller       \n"
21724 +                   "#     \\   /      ||||||||   \\    |  /            \n");
21725  }
21726  
21727  static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
21728 @@ -2925,11 +2932,14 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
21729         print_event_info(buf, m);
21730         seq_puts(m, "#                              _-----=> irqs-off\n"
21731                     "#                             / _----=> need-resched\n"
21732 -                   "#                            | / _---=> hardirq/softirq\n"
21733 -                   "#                            || / _--=> preempt-depth\n"
21734 -                   "#                            ||| /     delay\n"
21735 -                   "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
21736 -                   "#              | |       |   ||||       |         |\n");
21737 +                   "#                            |/  _-----=> need-resched_lazy\n"
21738 +                   "#                            || / _---=> hardirq/softirq\n"
21739 +                   "#                            ||| / _--=> preempt-depth\n"
21740 +                   "#                            |||| / _-=> preempt-lazy-depth\n"
21741 +                   "#                            ||||| / _-=> migrate-disable   \n"
21742 +                   "#                            |||||| /    delay\n"
21743 +                   "#           TASK-PID   CPU#  |||||||   TIMESTAMP  FUNCTION\n"
21744 +                   "#              | |       |   |||||||      |         |\n");
21745  }
21746  
21747  void
21748 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
21749 index f783df416726..6f2d0fa4fbf1 100644
21750 --- a/kernel/trace/trace.h
21751 +++ b/kernel/trace/trace.h
21752 @@ -123,6 +123,7 @@ struct kretprobe_trace_entry_head {
21753   *  NEED_RESCHED       - reschedule is requested
21754   *  HARDIRQ            - inside an interrupt handler
21755   *  SOFTIRQ            - inside a softirq handler
21756 + *  NEED_RESCHED_LAZY  - lazy reschedule is requested
21757   */
21758  enum trace_flag_type {
21759         TRACE_FLAG_IRQS_OFF             = 0x01,
21760 @@ -132,6 +133,7 @@ enum trace_flag_type {
21761         TRACE_FLAG_SOFTIRQ              = 0x10,
21762         TRACE_FLAG_PREEMPT_RESCHED      = 0x20,
21763         TRACE_FLAG_NMI                  = 0x40,
21764 +       TRACE_FLAG_NEED_RESCHED_LAZY    = 0x80,
21765  };
21766  
21767  #define TRACE_BUF_SIZE         1024
21768 diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
21769 index 03c0a48c3ac4..0b85d516b491 100644
21770 --- a/kernel/trace/trace_events.c
21771 +++ b/kernel/trace/trace_events.c
21772 @@ -187,6 +187,8 @@ static int trace_define_common_fields(void)
21773         __common_field(unsigned char, flags);
21774         __common_field(unsigned char, preempt_count);
21775         __common_field(int, pid);
21776 +       __common_field(unsigned short, migrate_disable);
21777 +       __common_field(unsigned short, padding);
21778  
21779         return ret;
21780  }
21781 diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
21782 index 03cdff84d026..940bd10b4406 100644
21783 --- a/kernel/trace/trace_irqsoff.c
21784 +++ b/kernel/trace/trace_irqsoff.c
21785 @@ -13,6 +13,7 @@
21786  #include <linux/uaccess.h>
21787  #include <linux/module.h>
21788  #include <linux/ftrace.h>
21789 +#include <trace/events/hist.h>
21790  
21791  #include "trace.h"
21792  
21793 @@ -424,11 +425,13 @@ void start_critical_timings(void)
21794  {
21795         if (preempt_trace() || irq_trace())
21796                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21797 +       trace_preemptirqsoff_hist_rcuidle(TRACE_START, 1);
21798  }
21799  EXPORT_SYMBOL_GPL(start_critical_timings);
21800  
21801  void stop_critical_timings(void)
21802  {
21803 +       trace_preemptirqsoff_hist_rcuidle(TRACE_STOP, 0);
21804         if (preempt_trace() || irq_trace())
21805                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21806  }
21807 @@ -438,6 +441,7 @@ EXPORT_SYMBOL_GPL(stop_critical_timings);
21808  #ifdef CONFIG_PROVE_LOCKING
21809  void time_hardirqs_on(unsigned long a0, unsigned long a1)
21810  {
21811 +       trace_preemptirqsoff_hist_rcuidle(IRQS_ON, 0);
21812         if (!preempt_trace() && irq_trace())
21813                 stop_critical_timing(a0, a1);
21814  }
21815 @@ -446,6 +450,7 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
21816  {
21817         if (!preempt_trace() && irq_trace())
21818                 start_critical_timing(a0, a1);
21819 +       trace_preemptirqsoff_hist_rcuidle(IRQS_OFF, 1);
21820  }
21821  
21822  #else /* !CONFIG_PROVE_LOCKING */
21823 @@ -471,6 +476,7 @@ inline void print_irqtrace_events(struct task_struct *curr)
21824   */
21825  void trace_hardirqs_on(void)
21826  {
21827 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
21828         if (!preempt_trace() && irq_trace())
21829                 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21830  }
21831 @@ -480,11 +486,13 @@ void trace_hardirqs_off(void)
21832  {
21833         if (!preempt_trace() && irq_trace())
21834                 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
21835 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
21836  }
21837  EXPORT_SYMBOL(trace_hardirqs_off);
21838  
21839  __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
21840  {
21841 +       trace_preemptirqsoff_hist(IRQS_ON, 0);
21842         if (!preempt_trace() && irq_trace())
21843                 stop_critical_timing(CALLER_ADDR0, caller_addr);
21844  }
21845 @@ -494,6 +502,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
21846  {
21847         if (!preempt_trace() && irq_trace())
21848                 start_critical_timing(CALLER_ADDR0, caller_addr);
21849 +       trace_preemptirqsoff_hist(IRQS_OFF, 1);
21850  }
21851  EXPORT_SYMBOL(trace_hardirqs_off_caller);
21852  
21853 @@ -503,12 +512,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
21854  #ifdef CONFIG_PREEMPT_TRACER
21855  void trace_preempt_on(unsigned long a0, unsigned long a1)
21856  {
21857 +       trace_preemptirqsoff_hist(PREEMPT_ON, 0);
21858         if (preempt_trace() && !irq_trace())
21859                 stop_critical_timing(a0, a1);
21860  }
21861  
21862  void trace_preempt_off(unsigned long a0, unsigned long a1)
21863  {
21864 +       trace_preemptirqsoff_hist(PREEMPT_ON, 1);
21865         if (preempt_trace() && !irq_trace())
21866                 start_critical_timing(a0, a1);
21867  }
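
[Editorial illustration, not part of the patch] The trace_irqsoff.c hunks above emit trace_preemptirqsoff_hist(_rcuidle) events at every irqs-on/off and preempt-on/off transition; the histogram code earlier in this patch consumes them via register_trace_preemptirqsoff_hist(). As a rough sketch of what another built-in consumer could look like, the fragment below counts start/stop events. The probe signature (void *, int reason, int starthist) is taken from probe_preemptirqsoff_hist() above; the counter and function names are made up and assume the code is built into the kernel next to the tracepoint.

/*
 * Hedged sketch: a trivial second consumer of the preemptirqsoff_hist
 * tracepoint added by this patch.
 */
#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <trace/events/hist.h>

static atomic_t hist_starts;	/* starthist == 1 events */
static atomic_t hist_stops;	/* starthist == 0 events */

static void notrace count_preemptirqsoff(void *v, int reason, int starthist)
{
	if (starthist)
		atomic_inc(&hist_starts);
	else
		atomic_inc(&hist_stops);
}

static int __init hist_count_init(void)
{
	/* same registration helper used by do_enable() above */
	return register_trace_preemptirqsoff_hist(count_preemptirqsoff, NULL);
}
device_initcall(hist_count_init);
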
21868 diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
21869 index 0bb9cf2d53e6..455a7464772f 100644
21870 --- a/kernel/trace/trace_output.c
21871 +++ b/kernel/trace/trace_output.c
21872 @@ -386,6 +386,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
21873  {
21874         char hardsoft_irq;
21875         char need_resched;
21876 +       char need_resched_lazy;
21877         char irqs_off;
21878         int hardirq;
21879         int softirq;
21880 @@ -416,6 +417,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
21881                 break;
21882         }
21883  
21884 +       need_resched_lazy =
21885 +               (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
21886 +
21887         hardsoft_irq =
21888                 (nmi && hardirq)     ? 'Z' :
21889                 nmi                  ? 'z' :
21890 @@ -424,14 +428,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
21891                 softirq              ? 's' :
21892                                        '.' ;
21893  
21894 -       trace_seq_printf(s, "%c%c%c",
21895 -                        irqs_off, need_resched, hardsoft_irq);
21896 +       trace_seq_printf(s, "%c%c%c%c",
21897 +                        irqs_off, need_resched, need_resched_lazy,
21898 +                        hardsoft_irq);
21899  
21900         if (entry->preempt_count)
21901                 trace_seq_printf(s, "%x", entry->preempt_count);
21902         else
21903                 trace_seq_putc(s, '.');
21904  
21905 +       if (entry->preempt_lazy_count)
21906 +               trace_seq_printf(s, "%x", entry->preempt_lazy_count);
21907 +       else
21908 +               trace_seq_putc(s, '.');
21909 +
21910 +       if (entry->migrate_disable)
21911 +               trace_seq_printf(s, "%x", entry->migrate_disable);
21912 +       else
21913 +               trace_seq_putc(s, '.');
21914 +
21915         return !trace_seq_has_overflowed(s);
21916  }
21917  
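
[Editorial illustration, not part of the patch] With the trace_output.c hunk above applied, the latency-format field block grows from four columns (irqs-off, need-resched, hardirq/softirq, preempt-depth) to seven, adding the lazy need-resched flag, the lazy preempt depth and the migrate-disable depth. The small userspace program below reproduces that column layout so the new fields are easy to see. Only TRACE_FLAG_NEED_RESCHED_LAZY (0x80, from the trace.h hunk above) is taken from the patch; the struct and the first three column characters are illustrative stand-ins.

/*
 * Hedged sketch: mimic the column layout emitted by the modified
 * trace_print_lat_fmt().
 */
#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED_LAZY 0x80	/* from trace.h above */

struct lat_fields {		/* illustrative subset of struct trace_entry */
	unsigned char flags;
	unsigned char preempt_count;
	unsigned char preempt_lazy_count;
	unsigned short migrate_disable;
};

static void print_hex_or_dot(unsigned int v)
{
	if (v)
		printf("%x", v);
	else
		putchar('.');
}

static void print_lat_columns(const struct lat_fields *e, char irqs_off,
			      char need_resched, char hardsoft_irq)
{
	char lazy = (e->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';

	/* irqs-off, need-resched, need-resched_lazy, hardirq/softirq */
	printf("%c%c%c%c", irqs_off, need_resched, lazy, hardsoft_irq);
	print_hex_or_dot(e->preempt_count);		/* preempt-depth */
	print_hex_or_dot(e->preempt_lazy_count);	/* preempt-lazy-depth */
	print_hex_or_dot(e->migrate_disable);		/* migrate-disable */
	putchar('\n');
}

int main(void)
{
	struct lat_fields e = {
		.flags = TRACE_FLAG_NEED_RESCHED_LAZY,
		.preempt_count = 1,
		.preempt_lazy_count = 1,
		.migrate_disable = 1,
	};

	print_lat_columns(&e, 'd', '.', '.');	/* prints "d.L.111" */
	return 0;
}
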
21918 diff --git a/kernel/user.c b/kernel/user.c
21919 index b069ccbfb0b0..1a2e88e98b5e 100644
21920 --- a/kernel/user.c
21921 +++ b/kernel/user.c
21922 @@ -161,11 +161,11 @@ void free_uid(struct user_struct *up)
21923         if (!up)
21924                 return;
21925  
21926 -       local_irq_save(flags);
21927 +       local_irq_save_nort(flags);
21928         if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
21929                 free_user(up, flags);
21930         else
21931 -               local_irq_restore(flags);
21932 +               local_irq_restore_nort(flags);
21933  }
21934  
21935  struct user_struct *alloc_uid(kuid_t uid)
21936 diff --git a/kernel/watchdog.c b/kernel/watchdog.c
21937 index 9acb29f280ec..caba62080411 100644
21938 --- a/kernel/watchdog.c
21939 +++ b/kernel/watchdog.c
21940 @@ -315,6 +315,8 @@ static int is_softlockup(unsigned long touch_ts)
21941  
21942  #ifdef CONFIG_HARDLOCKUP_DETECTOR
21943  
21944 +static DEFINE_RAW_SPINLOCK(watchdog_output_lock);
21945 +
21946  static struct perf_event_attr wd_hw_attr = {
21947         .type           = PERF_TYPE_HARDWARE,
21948         .config         = PERF_COUNT_HW_CPU_CYCLES,
21949 @@ -349,6 +351,13 @@ static void watchdog_overflow_callback(struct perf_event *event,
21950                 /* only print hardlockups once */
21951                 if (__this_cpu_read(hard_watchdog_warn) == true)
21952                         return;
21953 +               /*
21954 +                * If early-printk is enabled then make sure we do not
21955 +                * lock up in printk() and kill console logging:
21956 +                */
21957 +               printk_kill();
21958 +
21959 +               raw_spin_lock(&watchdog_output_lock);
21960  
21961                 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
21962                 print_modules();
21963 @@ -366,6 +375,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
21964                                 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
21965                         trigger_allbutself_cpu_backtrace();
21966  
21967 +               raw_spin_unlock(&watchdog_output_lock);
21968                 if (hardlockup_panic)
21969                         nmi_panic(regs, "Hard LOCKUP");
21970  
21971 @@ -513,6 +523,7 @@ static void watchdog_enable(unsigned int cpu)
21972         /* kick off the timer for the hardlockup detector */
21973         hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
21974         hrtimer->function = watchdog_timer_fn;
21975 +       hrtimer->irqsafe = 1;
21976  
21977         /* Enable the perf event */
21978         watchdog_nmi_enable(cpu);
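
[Editorial illustration, not part of the patch] The watchdog.c hunks above serialize the multi-line hard-lockup report behind a raw spinlock so that concurrent CPUs cannot interleave their backtraces, call printk_kill() (added elsewhere in this patch) so the NMI path cannot wedge in printk() when early-printk is active, and mark the watchdog hrtimer irqsafe so it keeps firing from hard interrupt context on RT. A raw spinlock is used because on PREEMPT_RT ordinary spinlocks become sleeping locks and may not be taken from the perf NMI callback. A minimal sketch of that serialization idiom, with made-up names:

/*
 * Hedged sketch: a raw_spinlock_t stays a spinning lock on PREEMPT_RT,
 * so it can guard a multi-line emergency report from NMI/hardirq
 * context without each CPU's output interleaving.
 */
#include <linux/spinlock.h>
#include <linux/printk.h>

static DEFINE_RAW_SPINLOCK(report_lock);	/* hypothetical */

static void emit_cpu_report(int cpu)		/* hypothetical */
{
	raw_spin_lock(&report_lock);
	pr_emerg("CPU %d: begin report\n", cpu);
	/* ... dump registers, stack and modules here ... */
	pr_emerg("CPU %d: end report\n", cpu);
	raw_spin_unlock(&report_lock);
}
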
21979 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
21980 index ef071ca73fc3..c7a62d6adb00 100644
21981 --- a/kernel/workqueue.c
21982 +++ b/kernel/workqueue.c
21983 @@ -48,6 +48,8 @@
21984  #include <linux/nodemask.h>
21985  #include <linux/moduleparam.h>
21986  #include <linux/uaccess.h>
21987 +#include <linux/locallock.h>
21988 +#include <linux/delay.h>
21989  
21990  #include "workqueue_internal.h"
21991  
21992 @@ -121,11 +123,16 @@ enum {
21993   *    cpu or grabbing pool->lock is enough for read access.  If
21994   *    POOL_DISASSOCIATED is set, it's identical to L.
21995   *
21996 + *    On RT we need the extra protection via rt_lock_idle_list() for
21997 + *    the list manipulations against read access from
21998 + *    wq_worker_sleeping(). All other places are nicely serialized via
21999 + *    pool->lock.
22000 + *
22001   * A: pool->attach_mutex protected.
22002   *
22003   * PL: wq_pool_mutex protected.
22004   *
22005 - * PR: wq_pool_mutex protected for writes.  Sched-RCU protected for reads.
22006 + * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
22007   *
22008   * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
22009   *
22010 @@ -134,7 +141,7 @@ enum {
22011   *
22012   * WQ: wq->mutex protected.
22013   *
22014 - * WR: wq->mutex protected for writes.  Sched-RCU protected for reads.
22015 + * WR: wq->mutex protected for writes.  RCU protected for reads.
22016   *
22017   * MD: wq_mayday_lock protected.
22018   */
22019 @@ -185,7 +192,7 @@ struct worker_pool {
22020         atomic_t                nr_running ____cacheline_aligned_in_smp;
22021  
22022         /*
22023 -        * Destruction of pool is sched-RCU protected to allow dereferences
22024 +        * Destruction of pool is RCU protected to allow dereferences
22025          * from get_work_pool().
22026          */
22027         struct rcu_head         rcu;
22028 @@ -214,7 +221,7 @@ struct pool_workqueue {
22029         /*
22030          * Release of unbound pwq is punted to system_wq.  See put_pwq()
22031          * and pwq_unbound_release_workfn() for details.  pool_workqueue
22032 -        * itself is also sched-RCU protected so that the first pwq can be
22033 +        * itself is also RCU protected so that the first pwq can be
22034          * determined without grabbing wq->mutex.
22035          */
22036         struct work_struct      unbound_release_work;
22037 @@ -348,6 +355,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq);
22038  struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
22039  EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
22040  
22041 +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
22042 +
22043  static int worker_thread(void *__worker);
22044  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22045  
22046 @@ -355,20 +364,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22047  #include <trace/events/workqueue.h>
22048  
22049  #define assert_rcu_or_pool_mutex()                                     \
22050 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22051 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22052                          !lockdep_is_held(&wq_pool_mutex),              \
22053 -                        "sched RCU or wq_pool_mutex should be held")
22054 +                        "RCU or wq_pool_mutex should be held")
22055  
22056  #define assert_rcu_or_wq_mutex(wq)                                     \
22057 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22058 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22059                          !lockdep_is_held(&wq->mutex),                  \
22060 -                        "sched RCU or wq->mutex should be held")
22061 +                        "RCU or wq->mutex should be held")
22062  
22063  #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
22064 -       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() &&                 \
22065 +       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
22066                          !lockdep_is_held(&wq->mutex) &&                \
22067                          !lockdep_is_held(&wq_pool_mutex),              \
22068 -                        "sched RCU, wq->mutex or wq_pool_mutex should be held")
22069 +                        "RCU, wq->mutex or wq_pool_mutex should be held")
22070  
22071  #define for_each_cpu_worker_pool(pool, cpu)                            \
22072         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
22073 @@ -380,7 +389,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22074   * @pool: iteration cursor
22075   * @pi: integer used for iteration
22076   *
22077 - * This must be called either with wq_pool_mutex held or sched RCU read
22078 + * This must be called either with wq_pool_mutex held or RCU read
22079   * locked.  If the pool needs to be used beyond the locking in effect, the
22080   * caller is responsible for guaranteeing that the pool stays online.
22081   *
22082 @@ -412,7 +421,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22083   * @pwq: iteration cursor
22084   * @wq: the target workqueue
22085   *
22086 - * This must be called either with wq->mutex held or sched RCU read locked.
22087 + * This must be called either with wq->mutex held or RCU read locked.
22088   * If the pwq needs to be used beyond the locking in effect, the caller is
22089   * responsible for guaranteeing that the pwq stays online.
22090   *
22091 @@ -424,6 +433,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
22092                 if (({ assert_rcu_or_wq_mutex(wq); false; })) { }       \
22093                 else
22094  
22095 +#ifdef CONFIG_PREEMPT_RT_BASE
22096 +static inline void rt_lock_idle_list(struct worker_pool *pool)
22097 +{
22098 +       preempt_disable();
22099 +}
22100 +static inline void rt_unlock_idle_list(struct worker_pool *pool)
22101 +{
22102 +       preempt_enable();
22103 +}
22104 +static inline void sched_lock_idle_list(struct worker_pool *pool) { }
22105 +static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
22106 +#else
22107 +static inline void rt_lock_idle_list(struct worker_pool *pool) { }
22108 +static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
22109 +static inline void sched_lock_idle_list(struct worker_pool *pool)
22110 +{
22111 +       spin_lock_irq(&pool->lock);
22112 +}
22113 +static inline void sched_unlock_idle_list(struct worker_pool *pool)
22114 +{
22115 +       spin_unlock_irq(&pool->lock);
22116 +}
22117 +#endif
22118 +
22119 +
22120  #ifdef CONFIG_DEBUG_OBJECTS_WORK
22121  
22122  static struct debug_obj_descr work_debug_descr;
22123 @@ -548,7 +582,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
22124   * @wq: the target workqueue
22125   * @node: the node ID
22126   *
22127 - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
22128 + * This must be called with any of wq_pool_mutex, wq->mutex or RCU
22129   * read locked.
22130   * If the pwq needs to be used beyond the locking in effect, the caller is
22131   * responsible for guaranteeing that the pwq stays online.
22132 @@ -692,8 +726,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
22133   * @work: the work item of interest
22134   *
22135   * Pools are created and destroyed under wq_pool_mutex, and allows read
22136 - * access under sched-RCU read lock.  As such, this function should be
22137 - * called under wq_pool_mutex or with preemption disabled.
22138 + * access under RCU read lock.  As such, this function should be
22139 + * called under wq_pool_mutex or inside of a rcu_read_lock() region.
22140   *
22141   * All fields of the returned pool are accessible as long as the above
22142   * mentioned locking is in effect.  If the returned pool needs to be used
22143 @@ -830,50 +864,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool)
22144   */
22145  static void wake_up_worker(struct worker_pool *pool)
22146  {
22147 -       struct worker *worker = first_idle_worker(pool);
22148 +       struct worker *worker;
22149 +
22150 +       rt_lock_idle_list(pool);
22151 +
22152 +       worker = first_idle_worker(pool);
22153  
22154         if (likely(worker))
22155                 wake_up_process(worker->task);
22156 +
22157 +       rt_unlock_idle_list(pool);
22158  }
22159  
22160  /**
22161 - * wq_worker_waking_up - a worker is waking up
22162 + * wq_worker_running - a worker is running again
22163   * @task: task waking up
22164 - * @cpu: CPU @task is waking up to
22165   *
22166 - * This function is called during try_to_wake_up() when a worker is
22167 - * being awoken.
22168 - *
22169 - * CONTEXT:
22170 - * spin_lock_irq(rq->lock)
22171 + * This function is called when a worker returns from schedule()
22172   */
22173 -void wq_worker_waking_up(struct task_struct *task, int cpu)
22174 +void wq_worker_running(struct task_struct *task)
22175  {
22176         struct worker *worker = kthread_data(task);
22177  
22178 -       if (!(worker->flags & WORKER_NOT_RUNNING)) {
22179 -               WARN_ON_ONCE(worker->pool->cpu != cpu);
22180 +       if (!worker->sleeping)
22181 +               return;
22182 +       if (!(worker->flags & WORKER_NOT_RUNNING))
22183                 atomic_inc(&worker->pool->nr_running);
22184 -       }
22185 +       worker->sleeping = 0;
22186  }
22187  
22188  /**
22189   * wq_worker_sleeping - a worker is going to sleep
22190   * @task: task going to sleep
22191   *
22192 - * This function is called during schedule() when a busy worker is
22193 - * going to sleep.  Worker on the same cpu can be woken up by
22194 - * returning pointer to its task.
22195 - *
22196 - * CONTEXT:
22197 - * spin_lock_irq(rq->lock)
22198 - *
22199 - * Return:
22200 - * Worker task on @cpu to wake up, %NULL if none.
22201 + * This function is called from schedule() when a busy worker is
22202 + * going to sleep.
22203   */
22204 -struct task_struct *wq_worker_sleeping(struct task_struct *task)
22205 +void wq_worker_sleeping(struct task_struct *task)
22206  {
22207 -       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
22208 +       struct worker *worker = kthread_data(task);
22209         struct worker_pool *pool;
22210  
22211         /*
22212 @@ -882,29 +911,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
22213          * checking NOT_RUNNING.
22214          */
22215         if (worker->flags & WORKER_NOT_RUNNING)
22216 -               return NULL;
22217 +               return;
22218  
22219         pool = worker->pool;
22220  
22221 -       /* this can only happen on the local cpu */
22222 -       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
22223 -               return NULL;
22224 +       if (WARN_ON_ONCE(worker->sleeping))
22225 +               return;
22226 +
22227 +       worker->sleeping = 1;
22228  
22229         /*
22230          * The counterpart of the following dec_and_test, implied mb,
22231          * worklist not empty test sequence is in insert_work().
22232          * Please read comment there.
22233 -        *
22234 -        * NOT_RUNNING is clear.  This means that we're bound to and
22235 -        * running on the local cpu w/ rq lock held and preemption
22236 -        * disabled, which in turn means that none else could be
22237 -        * manipulating idle_list, so dereferencing idle_list without pool
22238 -        * lock is safe.
22239          */
22240         if (atomic_dec_and_test(&pool->nr_running) &&
22241 -           !list_empty(&pool->worklist))
22242 -               to_wakeup = first_idle_worker(pool);
22243 -       return to_wakeup ? to_wakeup->task : NULL;
22244 +           !list_empty(&pool->worklist)) {
22245 +               sched_lock_idle_list(pool);
22246 +               wake_up_worker(pool);
22247 +               sched_unlock_idle_list(pool);
22248 +       }
22249  }
22250  
22251  /**
22252 @@ -1098,12 +1124,12 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
22253  {
22254         if (pwq) {
22255                 /*
22256 -                * As both pwqs and pools are sched-RCU protected, the
22257 +                * As both pwqs and pools are RCU protected, the
22258                  * following lock operations are safe.
22259                  */
22260 -               spin_lock_irq(&pwq->pool->lock);
22261 +               local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
22262                 put_pwq(pwq);
22263 -               spin_unlock_irq(&pwq->pool->lock);
22264 +               local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
22265         }
22266  }
22267  
22268 @@ -1207,7 +1233,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
22269         struct worker_pool *pool;
22270         struct pool_workqueue *pwq;
22271  
22272 -       local_irq_save(*flags);
22273 +       local_lock_irqsave(pendingb_lock, *flags);
22274  
22275         /* try to steal the timer if it exists */
22276         if (is_dwork) {
22277 @@ -1226,6 +1252,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
22278         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
22279                 return 0;
22280  
22281 +       rcu_read_lock();
22282         /*
22283          * The queueing is in progress, or it is already queued. Try to
22284          * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
22285 @@ -1264,14 +1291,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
22286                 set_work_pool_and_keep_pending(work, pool->id);
22287  
22288                 spin_unlock(&pool->lock);
22289 +               rcu_read_unlock();
22290                 return 1;
22291         }
22292         spin_unlock(&pool->lock);
22293  fail:
22294 -       local_irq_restore(*flags);
22295 +       rcu_read_unlock();
22296 +       local_unlock_irqrestore(pendingb_lock, *flags);
22297         if (work_is_canceling(work))
22298                 return -ENOENT;
22299 -       cpu_relax();
22300 +       cpu_chill();
22301         return -EAGAIN;
22302  }
22303  
22304 @@ -1373,7 +1402,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22305          * queued or lose PENDING.  Grabbing PENDING and queueing should
22306          * happen with IRQ disabled.
22307          */
22308 -       WARN_ON_ONCE(!irqs_disabled());
22309 +       WARN_ON_ONCE_NONRT(!irqs_disabled());
22310  
22311         debug_work_activate(work);
22312  
22313 @@ -1381,6 +1410,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22314         if (unlikely(wq->flags & __WQ_DRAINING) &&
22315             WARN_ON_ONCE(!is_chained_work(wq)))
22316                 return;
22317 +       rcu_read_lock();
22318  retry:
22319         if (req_cpu == WORK_CPU_UNBOUND)
22320                 cpu = wq_select_unbound_cpu(raw_smp_processor_id());
22321 @@ -1437,10 +1467,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22322         /* pwq determined, queue */
22323         trace_workqueue_queue_work(req_cpu, pwq, work);
22324  
22325 -       if (WARN_ON(!list_empty(&work->entry))) {
22326 -               spin_unlock(&pwq->pool->lock);
22327 -               return;
22328 -       }
22329 +       if (WARN_ON(!list_empty(&work->entry)))
22330 +               goto out;
22331  
22332         pwq->nr_in_flight[pwq->work_color]++;
22333         work_flags = work_color_to_flags(pwq->work_color);
22334 @@ -1458,7 +1486,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
22335  
22336         insert_work(pwq, work, worklist, work_flags);
22337  
22338 +out:
22339         spin_unlock(&pwq->pool->lock);
22340 +       rcu_read_unlock();
22341  }
22342  
22343  /**
22344 @@ -1478,14 +1508,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
22345         bool ret = false;
22346         unsigned long flags;
22347  
22348 -       local_irq_save(flags);
22349 +       local_lock_irqsave(pendingb_lock, flags);
22350  
22351         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
22352                 __queue_work(cpu, wq, work);
22353                 ret = true;
22354         }
22355  
22356 -       local_irq_restore(flags);
22357 +       local_unlock_irqrestore(pendingb_lock, flags);
22358         return ret;
22359  }
22360  EXPORT_SYMBOL(queue_work_on);
22361 @@ -1552,14 +1582,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
22362         unsigned long flags;
22363  
22364         /* read the comment in __queue_work() */
22365 -       local_irq_save(flags);
22366 +       local_lock_irqsave(pendingb_lock, flags);
22367  
22368         if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
22369                 __queue_delayed_work(cpu, wq, dwork, delay);
22370                 ret = true;
22371         }
22372  
22373 -       local_irq_restore(flags);
22374 +       local_unlock_irqrestore(pendingb_lock, flags);
22375         return ret;
22376  }
22377  EXPORT_SYMBOL(queue_delayed_work_on);
22378 @@ -1594,7 +1624,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
22379  
22380         if (likely(ret >= 0)) {
22381                 __queue_delayed_work(cpu, wq, dwork, delay);
22382 -               local_irq_restore(flags);
22383 +               local_unlock_irqrestore(pendingb_lock, flags);
22384         }
22385  
22386         /* -ENOENT from try_to_grab_pending() becomes %true */
22387 @@ -1627,7 +1657,9 @@ static void worker_enter_idle(struct worker *worker)
22388         worker->last_active = jiffies;
22389  
22390         /* idle_list is LIFO */
22391 +       rt_lock_idle_list(pool);
22392         list_add(&worker->entry, &pool->idle_list);
22393 +       rt_unlock_idle_list(pool);
22394  
22395         if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
22396                 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
22397 @@ -1660,7 +1692,9 @@ static void worker_leave_idle(struct worker *worker)
22398                 return;
22399         worker_clr_flags(worker, WORKER_IDLE);
22400         pool->nr_idle--;
22401 +       rt_lock_idle_list(pool);
22402         list_del_init(&worker->entry);
22403 +       rt_unlock_idle_list(pool);
22404  }
22405  
22406  static struct worker *alloc_worker(int node)
22407 @@ -1826,7 +1860,9 @@ static void destroy_worker(struct worker *worker)
22408         pool->nr_workers--;
22409         pool->nr_idle--;
22410  
22411 +       rt_lock_idle_list(pool);
22412         list_del_init(&worker->entry);
22413 +       rt_unlock_idle_list(pool);
22414         worker->flags |= WORKER_DIE;
22415         wake_up_process(worker->task);
22416  }
22417 @@ -2785,14 +2821,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
22418  
22419         might_sleep();
22420  
22421 -       local_irq_disable();
22422 +       rcu_read_lock();
22423         pool = get_work_pool(work);
22424         if (!pool) {
22425 -               local_irq_enable();
22426 +               rcu_read_unlock();
22427                 return false;
22428         }
22429  
22430 -       spin_lock(&pool->lock);
22431 +       spin_lock_irq(&pool->lock);
22432         /* see the comment in try_to_grab_pending() with the same code */
22433         pwq = get_work_pwq(work);
22434         if (pwq) {
22435 @@ -2821,10 +2857,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
22436         else
22437                 lock_map_acquire_read(&pwq->wq->lockdep_map);
22438         lock_map_release(&pwq->wq->lockdep_map);
22439 -
22440 +       rcu_read_unlock();
22441         return true;
22442  already_gone:
22443         spin_unlock_irq(&pool->lock);
22444 +       rcu_read_unlock();
22445         return false;
22446  }
22447  
22448 @@ -2911,7 +2948,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
22449  
22450         /* tell other tasks trying to grab @work to back off */
22451         mark_work_canceling(work);
22452 -       local_irq_restore(flags);
22453 +       local_unlock_irqrestore(pendingb_lock, flags);
22454  
22455         flush_work(work);
22456         clear_work_data(work);
22457 @@ -2966,10 +3003,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
22458   */
22459  bool flush_delayed_work(struct delayed_work *dwork)
22460  {
22461 -       local_irq_disable();
22462 +       local_lock_irq(pendingb_lock);
22463         if (del_timer_sync(&dwork->timer))
22464                 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
22465 -       local_irq_enable();
22466 +       local_unlock_irq(pendingb_lock);
22467         return flush_work(&dwork->work);
22468  }
22469  EXPORT_SYMBOL(flush_delayed_work);
22470 @@ -3004,7 +3041,7 @@ bool cancel_delayed_work(struct delayed_work *dwork)
22471  
22472         set_work_pool_and_clear_pending(&dwork->work,
22473                                         get_work_pool_id(&dwork->work));
22474 -       local_irq_restore(flags);
22475 +       local_unlock_irqrestore(pendingb_lock, flags);
22476         return ret;
22477  }
22478  EXPORT_SYMBOL(cancel_delayed_work);
22479 @@ -3233,7 +3270,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
22480   * put_unbound_pool - put a worker_pool
22481   * @pool: worker_pool to put
22482   *
22483 - * Put @pool.  If its refcnt reaches zero, it gets destroyed in sched-RCU
22484 + * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
22485   * safe manner.  get_unbound_pool() calls this function on its failure path
22486   * and this function should be able to release pools which went through,
22487   * successfully or not, init_worker_pool().
22488 @@ -3287,8 +3324,8 @@ static void put_unbound_pool(struct worker_pool *pool)
22489         del_timer_sync(&pool->idle_timer);
22490         del_timer_sync(&pool->mayday_timer);
22491  
22492 -       /* sched-RCU protected to allow dereferences from get_work_pool() */
22493 -       call_rcu_sched(&pool->rcu, rcu_free_pool);
22494 +       /* RCU protected to allow dereferences from get_work_pool() */
22495 +       call_rcu(&pool->rcu, rcu_free_pool);
22496  }
22497  
22498  /**
22499 @@ -3395,14 +3432,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
22500         put_unbound_pool(pool);
22501         mutex_unlock(&wq_pool_mutex);
22502  
22503 -       call_rcu_sched(&pwq->rcu, rcu_free_pwq);
22504 +       call_rcu(&pwq->rcu, rcu_free_pwq);
22505  
22506         /*
22507          * If we're the last pwq going away, @wq is already dead and no one
22508          * is gonna access it anymore.  Schedule RCU free.
22509          */
22510         if (is_last)
22511 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
22512 +               call_rcu(&wq->rcu, rcu_free_wq);
22513  }
22514  
22515  /**
22516 @@ -4052,7 +4089,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
22517                  * The base ref is never dropped on per-cpu pwqs.  Directly
22518                  * schedule RCU free.
22519                  */
22520 -               call_rcu_sched(&wq->rcu, rcu_free_wq);
22521 +               call_rcu(&wq->rcu, rcu_free_wq);
22522         } else {
22523                 /*
22524                  * We're the sole accessor of @wq at this point.  Directly
22525 @@ -4145,7 +4182,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
22526         struct pool_workqueue *pwq;
22527         bool ret;
22528  
22529 -       rcu_read_lock_sched();
22530 +       rcu_read_lock();
22531 +       preempt_disable();
22532  
22533         if (cpu == WORK_CPU_UNBOUND)
22534                 cpu = smp_processor_id();
22535 @@ -4156,7 +4194,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
22536                 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
22537  
22538         ret = !list_empty(&pwq->delayed_works);
22539 -       rcu_read_unlock_sched();
22540 +       preempt_enable();
22541 +       rcu_read_unlock();
22542  
22543         return ret;
22544  }
22545 @@ -4182,15 +4221,15 @@ unsigned int work_busy(struct work_struct *work)
22546         if (work_pending(work))
22547                 ret |= WORK_BUSY_PENDING;
22548  
22549 -       local_irq_save(flags);
22550 +       rcu_read_lock();
22551         pool = get_work_pool(work);
22552         if (pool) {
22553 -               spin_lock(&pool->lock);
22554 +               spin_lock_irqsave(&pool->lock, flags);
22555                 if (find_worker_executing_work(pool, work))
22556                         ret |= WORK_BUSY_RUNNING;
22557 -               spin_unlock(&pool->lock);
22558 +               spin_unlock_irqrestore(&pool->lock, flags);
22559         }
22560 -       local_irq_restore(flags);
22561 +       rcu_read_unlock();
22562  
22563         return ret;
22564  }
22565 @@ -4379,7 +4418,7 @@ void show_workqueue_state(void)
22566         unsigned long flags;
22567         int pi;
22568  
22569 -       rcu_read_lock_sched();
22570 +       rcu_read_lock();
22571  
22572         pr_info("Showing busy workqueues and worker pools:\n");
22573  
22574 @@ -4432,7 +4471,7 @@ void show_workqueue_state(void)
22575                 spin_unlock_irqrestore(&pool->lock, flags);
22576         }
22577  
22578 -       rcu_read_unlock_sched();
22579 +       rcu_read_unlock();
22580  }
22581  
22582  /*
22583 @@ -4770,16 +4809,16 @@ bool freeze_workqueues_busy(void)
22584                  * nr_active is monotonically decreasing.  It's safe
22585                  * to peek without lock.
22586                  */
22587 -               rcu_read_lock_sched();
22588 +               rcu_read_lock();
22589                 for_each_pwq(pwq, wq) {
22590                         WARN_ON_ONCE(pwq->nr_active < 0);
22591                         if (pwq->nr_active) {
22592                                 busy = true;
22593 -                               rcu_read_unlock_sched();
22594 +                               rcu_read_unlock();
22595                                 goto out_unlock;
22596                         }
22597                 }
22598 -               rcu_read_unlock_sched();
22599 +               rcu_read_unlock();
22600         }
22601  out_unlock:
22602         mutex_unlock(&wq_pool_mutex);
22603 @@ -4969,7 +5008,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
22604         const char *delim = "";
22605         int node, written = 0;
22606  
22607 -       rcu_read_lock_sched();
22608 +       get_online_cpus();
22609 +       rcu_read_lock();
22610         for_each_node(node) {
22611                 written += scnprintf(buf + written, PAGE_SIZE - written,
22612                                      "%s%d:%d", delim, node,
22613 @@ -4977,7 +5017,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
22614                 delim = " ";
22615         }
22616         written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
22617 -       rcu_read_unlock_sched();
22618 +       rcu_read_unlock();
22619 +       put_online_cpus();
22620  
22621         return written;
22622  }
22623 diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
22624 index 8635417c587b..f000c4d6917e 100644
22625 --- a/kernel/workqueue_internal.h
22626 +++ b/kernel/workqueue_internal.h
22627 @@ -43,6 +43,7 @@ struct worker {
22628         unsigned long           last_active;    /* L: last active timestamp */
22629         unsigned int            flags;          /* X: flags */
22630         int                     id;             /* I: worker id */
22631 +       int                     sleeping;       /* None */
22632  
22633         /*
22634          * Opaque string set with work_set_desc().  Printed out with task
22635 @@ -68,7 +69,7 @@ static inline struct worker *current_wq_worker(void)
22636   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
22637   * sched/core.c and workqueue.c.
22638   */
22639 -void wq_worker_waking_up(struct task_struct *task, int cpu);
22640 -struct task_struct *wq_worker_sleeping(struct task_struct *task);
22641 +void wq_worker_running(struct task_struct *task);
22642 +void wq_worker_sleeping(struct task_struct *task);
22643  
22644  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
22645 diff --git a/lib/Kconfig b/lib/Kconfig
22646 index d79909dc01ec..fd2accb2f2bb 100644
22647 --- a/lib/Kconfig
22648 +++ b/lib/Kconfig
22649 @@ -400,6 +400,7 @@ config CHECK_SIGNATURE
22650  
22651  config CPUMASK_OFFSTACK
22652         bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
22653 +       depends on !PREEMPT_RT_FULL
22654         help
22655           Use dynamic allocation for cpumask_var_t, instead of putting
22656           them on the stack.  This is a bit more expensive, but avoids
22657 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
22658 index cab7405f48d2..dbc49c48ff53 100644
22659 --- a/lib/Kconfig.debug
22660 +++ b/lib/Kconfig.debug
22661 @@ -977,6 +977,7 @@ config TIMER_STATS
22662  config DEBUG_PREEMPT
22663         bool "Debug preemptible kernel"
22664         depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT
22665 +       select USING_GET_LOCK_PARENT_IP
22666         default y
22667         help
22668           If you say Y here then the kernel will use a debug variant of the
22669 @@ -1159,8 +1160,17 @@ config LOCK_TORTURE_TEST
22670  
22671  endmenu # lock debugging
22672  
22673 +config USING_GET_LOCK_PARENT_IP
22674 +        bool
22675 +       help
22676 +         Enables the use of the function get_lock_parent_ip(), which
22677 +         uses __builtin_return_address(n) with n > 0 and thus triggers
22678 +         some gcc warnings. When this option is selected, those
22679 +         warnings are suppressed.
22680 +
22681  config TRACE_IRQFLAGS
22682         bool
22683 +       select USING_GET_LOCK_PARENT_IP
22684         help
22685           Enables hooks to interrupt enabling and disabling for
22686           either tracing or lock debugging.
22687 diff --git a/lib/debugobjects.c b/lib/debugobjects.c
22688 index a8e12601eb37..c76d5f0beafe 100644
22689 --- a/lib/debugobjects.c
22690 +++ b/lib/debugobjects.c
22691 @@ -308,7 +308,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
22692         struct debug_obj *obj;
22693         unsigned long flags;
22694  
22695 -       fill_pool();
22696 +#ifdef CONFIG_PREEMPT_RT_FULL
22697 +       if (preempt_count() == 0 && !irqs_disabled())
22698 +#endif
22699 +               fill_pool();
22700  
22701         db = get_bucket((unsigned long) addr);
22702  
22703 diff --git a/lib/idr.c b/lib/idr.c
22704 index 6098336df267..9decbe914595 100644
22705 --- a/lib/idr.c
22706 +++ b/lib/idr.c
22707 @@ -30,6 +30,7 @@
22708  #include <linux/idr.h>
22709  #include <linux/spinlock.h>
22710  #include <linux/percpu.h>
22711 +#include <linux/locallock.h>
22712  
22713  #define MAX_IDR_SHIFT          (sizeof(int) * 8 - 1)
22714  #define MAX_IDR_BIT            (1U << MAX_IDR_SHIFT)
22715 @@ -45,6 +46,37 @@ static DEFINE_PER_CPU(struct idr_layer *, idr_preload_head);
22716  static DEFINE_PER_CPU(int, idr_preload_cnt);
22717  static DEFINE_SPINLOCK(simple_ida_lock);
22718  
22719 +#ifdef CONFIG_PREEMPT_RT_FULL
22720 +static DEFINE_LOCAL_IRQ_LOCK(idr_lock);
22721 +
22722 +static inline void idr_preload_lock(void)
22723 +{
22724 +       local_lock(idr_lock);
22725 +}
22726 +
22727 +static inline void idr_preload_unlock(void)
22728 +{
22729 +       local_unlock(idr_lock);
22730 +}
22731 +
22732 +void idr_preload_end(void)
22733 +{
22734 +       idr_preload_unlock();
22735 +}
22736 +EXPORT_SYMBOL(idr_preload_end);
22737 +#else
22738 +static inline void idr_preload_lock(void)
22739 +{
22740 +       preempt_disable();
22741 +}
22742 +
22743 +static inline void idr_preload_unlock(void)
22744 +{
22745 +       preempt_enable();
22746 +}
22747 +#endif
22748 +
22749 +
22750  /* the maximum ID which can be allocated given idr->layers */
22751  static int idr_max(int layers)
22752  {
22753 @@ -115,14 +147,14 @@ static struct idr_layer *idr_layer_alloc(gfp_t gfp_mask, struct idr *layer_idr)
22754          * context.  See idr_preload() for details.
22755          */
22756         if (!in_interrupt()) {
22757 -               preempt_disable();
22758 +               idr_preload_lock();
22759                 new = __this_cpu_read(idr_preload_head);
22760                 if (new) {
22761                         __this_cpu_write(idr_preload_head, new->ary[0]);
22762                         __this_cpu_dec(idr_preload_cnt);
22763                         new->ary[0] = NULL;
22764                 }
22765 -               preempt_enable();
22766 +               idr_preload_unlock();
22767                 if (new)
22768                         return new;
22769         }
22770 @@ -366,7 +398,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
22771         idr_mark_full(pa, id);
22772  }
22773  
22774 -
22775  /**
22776   * idr_preload - preload for idr_alloc()
22777   * @gfp_mask: allocation mask to use for preloading
22778 @@ -401,7 +432,7 @@ void idr_preload(gfp_t gfp_mask)
22779         WARN_ON_ONCE(in_interrupt());
22780         might_sleep_if(gfpflags_allow_blocking(gfp_mask));
22781  
22782 -       preempt_disable();
22783 +       idr_preload_lock();
22784  
22785         /*
22786          * idr_alloc() is likely to succeed w/o full idr_layer buffer and
22787 @@ -413,9 +444,9 @@ void idr_preload(gfp_t gfp_mask)
22788         while (__this_cpu_read(idr_preload_cnt) < MAX_IDR_FREE) {
22789                 struct idr_layer *new;
22790  
22791 -               preempt_enable();
22792 +               idr_preload_unlock();
22793                 new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
22794 -               preempt_disable();
22795 +               idr_preload_lock();
22796                 if (!new)
22797                         break;
22798  
22799 diff --git a/lib/irq_poll.c b/lib/irq_poll.c
22800 index 836f7db4e548..709d4eed1df9 100644
22801 --- a/lib/irq_poll.c
22802 +++ b/lib/irq_poll.c
22803 @@ -36,6 +36,7 @@ void irq_poll_sched(struct irq_poll *iop)
22804         list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
22805         __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
22806         local_irq_restore(flags);
22807 +       preempt_check_resched_rt();
22808  }
22809  EXPORT_SYMBOL(irq_poll_sched);
22810  
22811 @@ -71,6 +72,7 @@ void irq_poll_complete(struct irq_poll *iop)
22812         local_irq_save(flags);
22813         __irq_poll_complete(iop);
22814         local_irq_restore(flags);
22815 +       preempt_check_resched_rt();
22816  }
22817  EXPORT_SYMBOL(irq_poll_complete);
22818  
22819 @@ -95,6 +97,7 @@ static void irq_poll_softirq(struct softirq_action *h)
22820                 }
22821  
22822                 local_irq_enable();
22823 +               preempt_check_resched_rt();
22824  
22825                 /* Even though interrupts have been re-enabled, this
22826                  * access is safe because interrupts can only add new
22827 @@ -132,6 +135,7 @@ static void irq_poll_softirq(struct softirq_action *h)
22828                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
22829  
22830         local_irq_enable();
22831 +       preempt_check_resched_rt();
22832  }
22833  
22834  /**
22835 @@ -199,6 +203,7 @@ static int irq_poll_cpu_notify(struct notifier_block *self,
22836                                  this_cpu_ptr(&blk_cpu_iopoll));
22837                 __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
22838                 local_irq_enable();
22839 +               preempt_check_resched_rt();
22840         }
22841  
22842         return NOTIFY_OK;
22843 diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
22844 index 872a15a2a637..b93a6103fa4d 100644
22845 --- a/lib/locking-selftest.c
22846 +++ b/lib/locking-selftest.c
22847 @@ -590,6 +590,8 @@ GENERATE_TESTCASE(init_held_rsem)
22848  #include "locking-selftest-spin-hardirq.h"
22849  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin)
22850  
22851 +#ifndef CONFIG_PREEMPT_RT_FULL
22852 +
22853  #include "locking-selftest-rlock-hardirq.h"
22854  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock)
22855  
22856 @@ -605,9 +607,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock)
22857  #include "locking-selftest-wlock-softirq.h"
22858  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock)
22859  
22860 +#endif
22861 +
22862  #undef E1
22863  #undef E2
22864  
22865 +#ifndef CONFIG_PREEMPT_RT_FULL
22866  /*
22867   * Enabling hardirqs with a softirq-safe lock held:
22868   */
22869 @@ -640,6 +645,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
22870  #undef E1
22871  #undef E2
22872  
22873 +#endif
22874 +
22875  /*
22876   * Enabling irqs with an irq-safe lock held:
22877   */
22878 @@ -663,6 +670,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock)
22879  #include "locking-selftest-spin-hardirq.h"
22880  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin)
22881  
22882 +#ifndef CONFIG_PREEMPT_RT_FULL
22883 +
22884  #include "locking-selftest-rlock-hardirq.h"
22885  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock)
22886  
22887 @@ -678,6 +687,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock)
22888  #include "locking-selftest-wlock-softirq.h"
22889  GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
22890  
22891 +#endif
22892 +
22893  #undef E1
22894  #undef E2
22895  
22896 @@ -709,6 +720,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock)
22897  #include "locking-selftest-spin-hardirq.h"
22898  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin)
22899  
22900 +#ifndef CONFIG_PREEMPT_RT_FULL
22901 +
22902  #include "locking-selftest-rlock-hardirq.h"
22903  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock)
22904  
22905 @@ -724,6 +737,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock)
22906  #include "locking-selftest-wlock-softirq.h"
22907  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
22908  
22909 +#endif
22910 +
22911  #undef E1
22912  #undef E2
22913  #undef E3
22914 @@ -757,6 +772,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock)
22915  #include "locking-selftest-spin-hardirq.h"
22916  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin)
22917  
22918 +#ifndef CONFIG_PREEMPT_RT_FULL
22919 +
22920  #include "locking-selftest-rlock-hardirq.h"
22921  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock)
22922  
22923 @@ -772,10 +789,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock)
22924  #include "locking-selftest-wlock-softirq.h"
22925  GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock)
22926  
22927 +#endif
22928 +
22929  #undef E1
22930  #undef E2
22931  #undef E3
22932  
22933 +#ifndef CONFIG_PREEMPT_RT_FULL
22934 +
22935  /*
22936   * read-lock / write-lock irq inversion.
22937   *
22938 @@ -838,6 +859,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock)
22939  #undef E2
22940  #undef E3
22941  
22942 +#endif
22943 +
22944 +#ifndef CONFIG_PREEMPT_RT_FULL
22945 +
22946  /*
22947   * read-lock / write-lock recursion that is actually safe.
22948   */
22949 @@ -876,6 +901,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft)
22950  #undef E2
22951  #undef E3
22952  
22953 +#endif
22954 +
22955  /*
22956   * read-lock / write-lock recursion that is unsafe.
22957   */
22958 @@ -1858,6 +1885,7 @@ void locking_selftest(void)
22959  
22960         printk("  --------------------------------------------------------------------------\n");
22961  
22962 +#ifndef CONFIG_PREEMPT_RT_FULL
22963         /*
22964          * irq-context testcases:
22965          */
22966 @@ -1870,6 +1898,28 @@ void locking_selftest(void)
22967  
22968         DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion);
22969  //     DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2);
22970 +#else
22971 +       /* On -rt, we only do hardirq context test for raw spinlock */
22972 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12);
22973 +       DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21);
22974 +
22975 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12);
22976 +       DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21);
22977 +
22978 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123);
22979 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132);
22980 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213);
22981 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231);
22982 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312);
22983 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321);
22984 +
22985 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123);
22986 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132);
22987 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213);
22988 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231);
22989 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312);
22990 +       DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321);
22991 +#endif
22992  
22993         ww_tests();
22994  
22995 diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
22996 index 6d40944960de..822a2c027e72 100644
22997 --- a/lib/percpu_ida.c
22998 +++ b/lib/percpu_ida.c
22999 @@ -26,6 +26,9 @@
23000  #include <linux/string.h>
23001  #include <linux/spinlock.h>
23002  #include <linux/percpu_ida.h>
23003 +#include <linux/locallock.h>
23004 +
23005 +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock);
23006  
23007  struct percpu_ida_cpu {
23008         /*
23009 @@ -148,13 +151,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23010         unsigned long flags;
23011         int tag;
23012  
23013 -       local_irq_save(flags);
23014 +       local_lock_irqsave(irq_off_lock, flags);
23015         tags = this_cpu_ptr(pool->tag_cpu);
23016  
23017         /* Fastpath */
23018         tag = alloc_local_tag(tags);
23019         if (likely(tag >= 0)) {
23020 -               local_irq_restore(flags);
23021 +               local_unlock_irqrestore(irq_off_lock, flags);
23022                 return tag;
23023         }
23024  
23025 @@ -173,6 +176,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23026  
23027                 if (!tags->nr_free)
23028                         alloc_global_tags(pool, tags);
23029 +
23030                 if (!tags->nr_free)
23031                         steal_tags(pool, tags);
23032  
23033 @@ -184,7 +188,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23034                 }
23035  
23036                 spin_unlock(&pool->lock);
23037 -               local_irq_restore(flags);
23038 +               local_unlock_irqrestore(irq_off_lock, flags);
23039  
23040                 if (tag >= 0 || state == TASK_RUNNING)
23041                         break;
23042 @@ -196,7 +200,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state)
23043  
23044                 schedule();
23045  
23046 -               local_irq_save(flags);
23047 +               local_lock_irqsave(irq_off_lock, flags);
23048                 tags = this_cpu_ptr(pool->tag_cpu);
23049         }
23050         if (state != TASK_RUNNING)
23051 @@ -221,7 +225,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
23052  
23053         BUG_ON(tag >= pool->nr_tags);
23054  
23055 -       local_irq_save(flags);
23056 +       local_lock_irqsave(irq_off_lock, flags);
23057         tags = this_cpu_ptr(pool->tag_cpu);
23058  
23059         spin_lock(&tags->lock);
23060 @@ -253,7 +257,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
23061                 spin_unlock(&pool->lock);
23062         }
23063  
23064 -       local_irq_restore(flags);
23065 +       local_unlock_irqrestore(irq_off_lock, flags);
23066  }
23067  EXPORT_SYMBOL_GPL(percpu_ida_free);
23068  
23069 @@ -345,7 +349,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
23070         struct percpu_ida_cpu *remote;
23071         unsigned cpu, i, err = 0;
23072  
23073 -       local_irq_save(flags);
23074 +       local_lock_irqsave(irq_off_lock, flags);
23075         for_each_possible_cpu(cpu) {
23076                 remote = per_cpu_ptr(pool->tag_cpu, cpu);
23077                 spin_lock(&remote->lock);
23078 @@ -367,7 +371,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn,
23079         }
23080         spin_unlock(&pool->lock);
23081  out:
23082 -       local_irq_restore(flags);
23083 +       local_unlock_irqrestore(irq_off_lock, flags);
23084         return err;
23085  }
23086  EXPORT_SYMBOL_GPL(percpu_ida_for_each_free);
23087 diff --git a/lib/radix-tree.c b/lib/radix-tree.c
23088 index 8e6d552c40dd..881cc195d85f 100644
23089 --- a/lib/radix-tree.c
23090 +++ b/lib/radix-tree.c
23091 @@ -290,13 +290,14 @@ radix_tree_node_alloc(struct radix_tree_root *root)
23092                  * succeed in getting a node here (and never reach
23093                  * kmem_cache_alloc)
23094                  */
23095 -               rtp = this_cpu_ptr(&radix_tree_preloads);
23096 +               rtp = &get_cpu_var(radix_tree_preloads);
23097                 if (rtp->nr) {
23098                         ret = rtp->nodes;
23099                         rtp->nodes = ret->private_data;
23100                         ret->private_data = NULL;
23101                         rtp->nr--;
23102                 }
23103 +               put_cpu_var(radix_tree_preloads);
23104                 /*
23105                  * Update the allocation stack trace as this is more useful
23106                  * for debugging.
23107 @@ -336,6 +337,7 @@ radix_tree_node_free(struct radix_tree_node *node)
23108         call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
23109  }
23110  
23111 +#ifndef CONFIG_PREEMPT_RT_FULL
23112  /*
23113   * Load up this CPU's radix_tree_node buffer with sufficient objects to
23114   * ensure that the addition of a single element in the tree cannot fail.  On
23115 @@ -455,6 +457,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
23116  
23117         return __radix_tree_preload(gfp_mask, nr_nodes);
23118  }
23119 +#endif
23120  
23121  /*
23122   * The maximum index which can be stored in a radix tree
23123 diff --git a/lib/scatterlist.c b/lib/scatterlist.c
23124 index 004fc70fc56a..ccc46992a517 100644
23125 --- a/lib/scatterlist.c
23126 +++ b/lib/scatterlist.c
23127 @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter)
23128                         flush_kernel_dcache_page(miter->page);
23129  
23130                 if (miter->__flags & SG_MITER_ATOMIC) {
23131 -                       WARN_ON_ONCE(preemptible());
23132 +                       WARN_ON_ONCE(!pagefault_disabled());
23133                         kunmap_atomic(miter->addr);
23134                 } else
23135                         kunmap(miter->page);
23136 @@ -664,7 +664,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
23137         if (!sg_miter_skip(&miter, skip))
23138                 return false;
23139  
23140 -       local_irq_save(flags);
23141 +       local_irq_save_nort(flags);
23142  
23143         while (sg_miter_next(&miter) && offset < buflen) {
23144                 unsigned int len;
23145 @@ -681,7 +681,7 @@ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf,
23146  
23147         sg_miter_stop(&miter);
23148  
23149 -       local_irq_restore(flags);
23150 +       local_irq_restore_nort(flags);
23151         return offset;
23152  }
23153  EXPORT_SYMBOL(sg_copy_buffer);
23154 diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
23155 index 1afec32de6f2..11fa431046a8 100644
23156 --- a/lib/smp_processor_id.c
23157 +++ b/lib/smp_processor_id.c
23158 @@ -39,8 +39,9 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
23159         if (!printk_ratelimit())
23160                 goto out_enable;
23161  
23162 -       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
23163 -               what1, what2, preempt_count() - 1, current->comm, current->pid);
23164 +       printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x %08x] code: %s/%d\n",
23165 +               what1, what2, preempt_count() - 1, __migrate_disabled(current),
23166 +               current->comm, current->pid);
23167  
23168         print_symbol("caller is %s\n", (long)__builtin_return_address(0));
23169         dump_stack();
23170 diff --git a/localversion-rt b/localversion-rt
23171 new file mode 100644
23172 index 000000000000..0efe7ba1930e
23173 --- /dev/null
23174 +++ b/localversion-rt
23175 @@ -0,0 +1 @@
23176 +-rt5
23177 diff --git a/mm/Kconfig b/mm/Kconfig
23178 index be0ee11fa0d9..fe2857d67973 100644
23179 --- a/mm/Kconfig
23180 +++ b/mm/Kconfig
23181 @@ -410,7 +410,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
23182  
23183  config TRANSPARENT_HUGEPAGE
23184         bool "Transparent Hugepage Support"
23185 -       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
23186 +       depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL
23187         select COMPACTION
23188         select RADIX_TREE_MULTIORDER
23189         help
23190 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
23191 index 8fde443f36d7..d7a863b0ec20 100644
23192 --- a/mm/backing-dev.c
23193 +++ b/mm/backing-dev.c
23194 @@ -457,9 +457,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
23195  {
23196         unsigned long flags;
23197  
23198 -       local_irq_save(flags);
23199 +       local_irq_save_nort(flags);
23200         if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
23201 -               local_irq_restore(flags);
23202 +               local_irq_restore_nort(flags);
23203                 return;
23204         }
23205  
23206 diff --git a/mm/compaction.c b/mm/compaction.c
23207 index 9affb2908304..d5eb0e52e96f 100644
23208 --- a/mm/compaction.c
23209 +++ b/mm/compaction.c
23210 @@ -1585,10 +1585,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
23211                                 block_start_pfn(cc->migrate_pfn, cc->order);
23212  
23213                         if (cc->last_migrated_pfn < current_block_start) {
23214 -                               cpu = get_cpu();
23215 +                               cpu = get_cpu_light();
23216 +                               local_lock_irq(swapvec_lock);
23217                                 lru_add_drain_cpu(cpu);
23218 +                               local_unlock_irq(swapvec_lock);
23219                                 drain_local_pages(zone);
23220 -                               put_cpu();
23221 +                               put_cpu_light();
23222                                 /* No more flushing until we migrate again */
23223                                 cc->last_migrated_pfn = 0;
23224                         }
23225 diff --git a/mm/filemap.c b/mm/filemap.c
23226 index ced9ef6c06b0..19f6f0d77604 100644
23227 --- a/mm/filemap.c
23228 +++ b/mm/filemap.c
23229 @@ -159,9 +159,12 @@ static int page_cache_tree_insert(struct address_space *mapping,
23230                  * node->private_list is protected by
23231                  * mapping->tree_lock.
23232                  */
23233 -               if (!list_empty(&node->private_list))
23234 -                       list_lru_del(&workingset_shadow_nodes,
23235 +               if (!list_empty(&node->private_list)) {
23236 +                       local_lock(workingset_shadow_lock);
23237 +                       list_lru_del(&__workingset_shadow_nodes,
23238                                      &node->private_list);
23239 +                       local_unlock(workingset_shadow_lock);
23240 +               }
23241         }
23242         return 0;
23243  }
23244 @@ -217,8 +220,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
23245                 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
23246                                 list_empty(&node->private_list)) {
23247                         node->private_data = mapping;
23248 -                       list_lru_add(&workingset_shadow_nodes,
23249 -                                       &node->private_list);
23250 +                       local_lock(workingset_shadow_lock);
23251 +                       list_lru_add(&__workingset_shadow_nodes,
23252 +                                    &node->private_list);
23253 +                       local_unlock(workingset_shadow_lock);
23254                 }
23255         }
23256  
23257 diff --git a/mm/highmem.c b/mm/highmem.c
23258 index 50b4ca6787f0..77518a3b35a1 100644
23259 --- a/mm/highmem.c
23260 +++ b/mm/highmem.c
23261 @@ -29,10 +29,11 @@
23262  #include <linux/kgdb.h>
23263  #include <asm/tlbflush.h>
23264  
23265 -
23266 +#ifndef CONFIG_PREEMPT_RT_FULL
23267  #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
23268  DEFINE_PER_CPU(int, __kmap_atomic_idx);
23269  #endif
23270 +#endif
23271  
23272  /*
23273   * Virtual_count is not a pure "count".
23274 @@ -107,8 +108,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
23275  unsigned long totalhigh_pages __read_mostly;
23276  EXPORT_SYMBOL(totalhigh_pages);
23277  
23278 -
23279 +#ifndef CONFIG_PREEMPT_RT_FULL
23280  EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
23281 +#endif
23282  
23283  unsigned int nr_free_highpages (void)
23284  {
23285 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
23286 index 4be518d4e68a..724240ca2f35 100644
23287 --- a/mm/memcontrol.c
23288 +++ b/mm/memcontrol.c
23289 @@ -67,6 +67,7 @@
23290  #include <net/sock.h>
23291  #include <net/ip.h>
23292  #include "slab.h"
23293 +#include <linux/locallock.h>
23294  
23295  #include <asm/uaccess.h>
23296  
23297 @@ -92,6 +93,8 @@ int do_swap_account __read_mostly;
23298  #define do_swap_account                0
23299  #endif
23300  
23301 +static DEFINE_LOCAL_IRQ_LOCK(event_lock);
23302 +
23303  /* Whether legacy memory+swap accounting is active */
23304  static bool do_memsw_account(void)
23305  {
23306 @@ -1724,6 +1727,7 @@ struct memcg_stock_pcp {
23307  #define FLUSHING_CACHED_CHARGE 0
23308  };
23309  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
23310 +static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
23311  static DEFINE_MUTEX(percpu_charge_mutex);
23312  
23313  /**
23314 @@ -1746,7 +1750,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23315         if (nr_pages > CHARGE_BATCH)
23316                 return ret;
23317  
23318 -       local_irq_save(flags);
23319 +       local_lock_irqsave(memcg_stock_ll, flags);
23320  
23321         stock = this_cpu_ptr(&memcg_stock);
23322         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
23323 @@ -1754,7 +1758,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23324                 ret = true;
23325         }
23326  
23327 -       local_irq_restore(flags);
23328 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23329  
23330         return ret;
23331  }
23332 @@ -1781,13 +1785,13 @@ static void drain_local_stock(struct work_struct *dummy)
23333         struct memcg_stock_pcp *stock;
23334         unsigned long flags;
23335  
23336 -       local_irq_save(flags);
23337 +       local_lock_irqsave(memcg_stock_ll, flags);
23338  
23339         stock = this_cpu_ptr(&memcg_stock);
23340         drain_stock(stock);
23341         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
23342  
23343 -       local_irq_restore(flags);
23344 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23345  }
23346  
23347  /*
23348 @@ -1799,7 +1803,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23349         struct memcg_stock_pcp *stock;
23350         unsigned long flags;
23351  
23352 -       local_irq_save(flags);
23353 +       local_lock_irqsave(memcg_stock_ll, flags);
23354  
23355         stock = this_cpu_ptr(&memcg_stock);
23356         if (stock->cached != memcg) { /* reset if necessary */
23357 @@ -1808,7 +1812,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
23358         }
23359         stock->nr_pages += nr_pages;
23360  
23361 -       local_irq_restore(flags);
23362 +       local_unlock_irqrestore(memcg_stock_ll, flags);
23363  }
23364  
23365  /*
23366 @@ -1824,7 +1828,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
23367                 return;
23368         /* Notify other cpus that system-wide "drain" is running */
23369         get_online_cpus();
23370 -       curcpu = get_cpu();
23371 +       curcpu = get_cpu_light();
23372         for_each_online_cpu(cpu) {
23373                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
23374                 struct mem_cgroup *memcg;
23375 @@ -1841,7 +1845,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
23376                                 schedule_work_on(cpu, &stock->work);
23377                 }
23378         }
23379 -       put_cpu();
23380 +       put_cpu_light();
23381         put_online_cpus();
23382         mutex_unlock(&percpu_charge_mutex);
23383  }
23384 @@ -4566,12 +4570,12 @@ static int mem_cgroup_move_account(struct page *page,
23385  
23386         ret = 0;
23387  
23388 -       local_irq_disable();
23389 +       local_lock_irq(event_lock);
23390         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
23391         memcg_check_events(to, page);
23392         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
23393         memcg_check_events(from, page);
23394 -       local_irq_enable();
23395 +       local_unlock_irq(event_lock);
23396  out_unlock:
23397         unlock_page(page);
23398  out:
23399 @@ -5444,10 +5448,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
23400  
23401         commit_charge(page, memcg, lrucare);
23402  
23403 -       local_irq_disable();
23404 +       local_lock_irq(event_lock);
23405         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
23406         memcg_check_events(memcg, page);
23407 -       local_irq_enable();
23408 +       local_unlock_irq(event_lock);
23409  
23410         if (do_memsw_account() && PageSwapCache(page)) {
23411                 swp_entry_t entry = { .val = page_private(page) };
23412 @@ -5503,14 +5507,14 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
23413                 memcg_oom_recover(memcg);
23414         }
23415  
23416 -       local_irq_save(flags);
23417 +       local_lock_irqsave(event_lock, flags);
23418         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
23419         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
23420         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
23421         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
23422         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
23423         memcg_check_events(memcg, dummy_page);
23424 -       local_irq_restore(flags);
23425 +       local_unlock_irqrestore(event_lock, flags);
23426  
23427         if (!mem_cgroup_is_root(memcg))
23428                 css_put_many(&memcg->css, nr_pages);
23429 @@ -5665,10 +5669,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
23430  
23431         commit_charge(newpage, memcg, false);
23432  
23433 -       local_irq_save(flags);
23434 +       local_lock_irqsave(event_lock, flags);
23435         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
23436         memcg_check_events(memcg, newpage);
23437 -       local_irq_restore(flags);
23438 +       local_unlock_irqrestore(event_lock, flags);
23439  }
23440  
23441  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
23442 @@ -5845,6 +5849,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
23443  {
23444         struct mem_cgroup *memcg, *swap_memcg;
23445         unsigned short oldid;
23446 +       unsigned long flags;
23447  
23448         VM_BUG_ON_PAGE(PageLRU(page), page);
23449         VM_BUG_ON_PAGE(page_count(page), page);
23450 @@ -5885,12 +5890,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
23451          * important here to have the interrupts disabled because it is the
23452          * only synchronisation we have for udpating the per-CPU variables.
23453          */
23454 +       local_lock_irqsave(event_lock, flags);
23455 +#ifndef CONFIG_PREEMPT_RT_BASE
23456         VM_BUG_ON(!irqs_disabled());
23457 +#endif
23458         mem_cgroup_charge_statistics(memcg, page, false, -1);
23459         memcg_check_events(memcg, page);
23460  
23461         if (!mem_cgroup_is_root(memcg))
23462                 css_put(&memcg->css);
23463 +       local_unlock_irqrestore(event_lock, flags);
23464  }
23465  
23466  /*
23467 diff --git a/mm/mmu_context.c b/mm/mmu_context.c
23468 index 6f4d27c5bb32..5cd25c745a8f 100644
23469 --- a/mm/mmu_context.c
23470 +++ b/mm/mmu_context.c
23471 @@ -23,6 +23,7 @@ void use_mm(struct mm_struct *mm)
23472         struct task_struct *tsk = current;
23473  
23474         task_lock(tsk);
23475 +       preempt_disable_rt();
23476         active_mm = tsk->active_mm;
23477         if (active_mm != mm) {
23478                 atomic_inc(&mm->mm_count);
23479 @@ -30,6 +31,7 @@ void use_mm(struct mm_struct *mm)
23480         }
23481         tsk->mm = mm;
23482         switch_mm(active_mm, mm, tsk);
23483 +       preempt_enable_rt();
23484         task_unlock(tsk);
23485  #ifdef finish_arch_post_lock_switch
23486         finish_arch_post_lock_switch();
23487 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
23488 index a2214c64ed3c..4be4d5d66f73 100644
23489 --- a/mm/page_alloc.c
23490 +++ b/mm/page_alloc.c
23491 @@ -61,6 +61,7 @@
23492  #include <linux/page_ext.h>
23493  #include <linux/hugetlb.h>
23494  #include <linux/sched/rt.h>
23495 +#include <linux/locallock.h>
23496  #include <linux/page_owner.h>
23497  #include <linux/kthread.h>
23498  #include <linux/memcontrol.h>
23499 @@ -276,6 +277,18 @@ EXPORT_SYMBOL(nr_node_ids);
23500  EXPORT_SYMBOL(nr_online_nodes);
23501  #endif
23502  
23503 +static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
23504 +
23505 +#ifdef CONFIG_PREEMPT_RT_BASE
23506 +# define cpu_lock_irqsave(cpu, flags)          \
23507 +       local_lock_irqsave_on(pa_lock, flags, cpu)
23508 +# define cpu_unlock_irqrestore(cpu, flags)     \
23509 +       local_unlock_irqrestore_on(pa_lock, flags, cpu)
23510 +#else
23511 +# define cpu_lock_irqsave(cpu, flags)          local_irq_save(flags)
23512 +# define cpu_unlock_irqrestore(cpu, flags)     local_irq_restore(flags)
23513 +#endif
23514 +
23515  int page_group_by_mobility_disabled __read_mostly;
23516  
23517  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
23518 @@ -1056,7 +1069,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
23519  #endif /* CONFIG_DEBUG_VM */
23520  
23521  /*
23522 - * Frees a number of pages from the PCP lists
23523 + * Frees a number of pages which have been collected from the pcp lists.
23524   * Assumes all pages on list are in same zone, and of same order.
23525   * count is the number of pages to free.
23526   *
23527 @@ -1067,19 +1080,58 @@ static bool bulkfree_pcp_prepare(struct page *page)
23528   * pinned" detection logic.
23529   */
23530  static void free_pcppages_bulk(struct zone *zone, int count,
23531 -                                       struct per_cpu_pages *pcp)
23532 +                              struct list_head *list)
23533  {
23534 -       int migratetype = 0;
23535 -       int batch_free = 0;
23536         unsigned long nr_scanned;
23537         bool isolated_pageblocks;
23538 +       unsigned long flags;
23539 +
23540 +       spin_lock_irqsave(&zone->lock, flags);
23541  
23542 -       spin_lock(&zone->lock);
23543         isolated_pageblocks = has_isolate_pageblock(zone);
23544         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
23545         if (nr_scanned)
23546                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
23547  
23548 +       while (!list_empty(list)) {
23549 +               struct page *page;
23550 +               int mt; /* migratetype of the to-be-freed page */
23551 +
23552 +               page = list_first_entry(list, struct page, lru);
23553 +               /* must delete as __free_one_page list manipulates */
23554 +               list_del(&page->lru);
23555 +
23556 +               mt = get_pcppage_migratetype(page);
23557 +               /* MIGRATE_ISOLATE page should not go to pcplists */
23558 +               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
23559 +               /* Pageblock could have been isolated meanwhile */
23560 +               if (unlikely(isolated_pageblocks))
23561 +                       mt = get_pageblock_migratetype(page);
23562 +
23563 +               if (bulkfree_pcp_prepare(page))
23564 +                       continue;
23565 +
23566 +               __free_one_page(page, page_to_pfn(page), zone, 0, mt);
23567 +               trace_mm_page_pcpu_drain(page, 0, mt);
23568 +               count--;
23569 +       }
23570 +       WARN_ON(count != 0);
23571 +       spin_unlock_irqrestore(&zone->lock, flags);
23572 +}
23573 +
23574 +/*
23575 + * Moves a number of pages from the PCP lists to a list which is
23576 + * freed outside of the locked region.
23577 + *
23578 + * Assumes all pages on list are in same zone, and of same order.
23579 + * count is the number of pages to free.
23580 + */
23581 +static void isolate_pcp_pages(int count, struct per_cpu_pages *src,
23582 +                             struct list_head *dst)
23583 +{
23584 +       int migratetype = 0;
23585 +       int batch_free = 0;
23586 +
23587         while (count) {
23588                 struct page *page;
23589                 struct list_head *list;
23590 @@ -1095,7 +1147,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
23591                         batch_free++;
23592                         if (++migratetype == MIGRATE_PCPTYPES)
23593                                 migratetype = 0;
23594 -                       list = &pcp->lists[migratetype];
23595 +                       list = &src->lists[migratetype];
23596                 } while (list_empty(list));
23597  
23598                 /* This is the only non-empty list. Free them all. */
23599 @@ -1103,27 +1155,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
23600                         batch_free = count;
23601  
23602                 do {
23603 -                       int mt; /* migratetype of the to-be-freed page */
23604 -
23605                         page = list_last_entry(list, struct page, lru);
23606 -                       /* must delete as __free_one_page list manipulates */
23607                         list_del(&page->lru);
23608  
23609 -                       mt = get_pcppage_migratetype(page);
23610 -                       /* MIGRATE_ISOLATE page should not go to pcplists */
23611 -                       VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
23612 -                       /* Pageblock could have been isolated meanwhile */
23613 -                       if (unlikely(isolated_pageblocks))
23614 -                               mt = get_pageblock_migratetype(page);
23615 -
23616 -                       if (bulkfree_pcp_prepare(page))
23617 -                               continue;
23618 -
23619 -                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
23620 -                       trace_mm_page_pcpu_drain(page, 0, mt);
23621 +                       list_add(&page->lru, dst);
23622                 } while (--count && --batch_free && !list_empty(list));
23623         }
23624 -       spin_unlock(&zone->lock);
23625  }
23626  
23627  static void free_one_page(struct zone *zone,
23628 @@ -1132,7 +1169,9 @@ static void free_one_page(struct zone *zone,
23629                                 int migratetype)
23630  {
23631         unsigned long nr_scanned;
23632 -       spin_lock(&zone->lock);
23633 +       unsigned long flags;
23634 +
23635 +       spin_lock_irqsave(&zone->lock, flags);
23636         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
23637         if (nr_scanned)
23638                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
23639 @@ -1142,7 +1181,7 @@ static void free_one_page(struct zone *zone,
23640                 migratetype = get_pfnblock_migratetype(page, pfn);
23641         }
23642         __free_one_page(page, pfn, zone, order, migratetype);
23643 -       spin_unlock(&zone->lock);
23644 +       spin_unlock_irqrestore(&zone->lock, flags);
23645  }
23646  
23647  static void __meminit __init_single_page(struct page *page, unsigned long pfn,
23648 @@ -1228,10 +1267,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
23649                 return;
23650  
23651         migratetype = get_pfnblock_migratetype(page, pfn);
23652 -       local_irq_save(flags);
23653 +       local_lock_irqsave(pa_lock, flags);
23654         __count_vm_events(PGFREE, 1 << order);
23655         free_one_page(page_zone(page), page, pfn, order, migratetype);
23656 -       local_irq_restore(flags);
23657 +       local_unlock_irqrestore(pa_lock, flags);
23658  }
23659  
23660  static void __init __free_pages_boot_core(struct page *page, unsigned int order)
23661 @@ -2219,16 +2258,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
23662  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
23663  {
23664         unsigned long flags;
23665 +       LIST_HEAD(dst);
23666         int to_drain, batch;
23667  
23668 -       local_irq_save(flags);
23669 +       local_lock_irqsave(pa_lock, flags);
23670         batch = READ_ONCE(pcp->batch);
23671         to_drain = min(pcp->count, batch);
23672         if (to_drain > 0) {
23673 -               free_pcppages_bulk(zone, to_drain, pcp);
23674 +               isolate_pcp_pages(to_drain, pcp, &dst);
23675                 pcp->count -= to_drain;
23676         }
23677 -       local_irq_restore(flags);
23678 +       local_unlock_irqrestore(pa_lock, flags);
23679 +       free_pcppages_bulk(zone, to_drain, &dst);
23680  }
23681  #endif
23682  
23683 @@ -2244,16 +2285,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
23684         unsigned long flags;
23685         struct per_cpu_pageset *pset;
23686         struct per_cpu_pages *pcp;
23687 +       LIST_HEAD(dst);
23688 +       int count;
23689  
23690 -       local_irq_save(flags);
23691 +       cpu_lock_irqsave(cpu, flags);
23692         pset = per_cpu_ptr(zone->pageset, cpu);
23693  
23694         pcp = &pset->pcp;
23695 -       if (pcp->count) {
23696 -               free_pcppages_bulk(zone, pcp->count, pcp);
23697 +       count = pcp->count;
23698 +       if (count) {
23699 +               isolate_pcp_pages(count, pcp, &dst);
23700                 pcp->count = 0;
23701         }
23702 -       local_irq_restore(flags);
23703 +       cpu_unlock_irqrestore(cpu, flags);
23704 +       if (count)
23705 +               free_pcppages_bulk(zone, count, &dst);
23706  }
23707  
23708  /*
23709 @@ -2339,8 +2385,17 @@ void drain_all_pages(struct zone *zone)
23710                 else
23711                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
23712         }
23713 +#ifndef CONFIG_PREEMPT_RT_BASE
23714         on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
23715                                                                 zone, 1);
23716 +#else
23717 +       for_each_cpu(cpu, &cpus_with_pcps) {
23718 +               if (zone)
23719 +                       drain_pages_zone(cpu, zone);
23720 +               else
23721 +                       drain_pages(cpu);
23722 +       }
23723 +#endif
23724  }
23725  
23726  #ifdef CONFIG_HIBERNATION
23727 @@ -2400,7 +2455,7 @@ void free_hot_cold_page(struct page *page, bool cold)
23728  
23729         migratetype = get_pfnblock_migratetype(page, pfn);
23730         set_pcppage_migratetype(page, migratetype);
23731 -       local_irq_save(flags);
23732 +       local_lock_irqsave(pa_lock, flags);
23733         __count_vm_event(PGFREE);
23734  
23735         /*
23736 @@ -2426,12 +2481,17 @@ void free_hot_cold_page(struct page *page, bool cold)
23737         pcp->count++;
23738         if (pcp->count >= pcp->high) {
23739                 unsigned long batch = READ_ONCE(pcp->batch);
23740 -               free_pcppages_bulk(zone, batch, pcp);
23741 +               LIST_HEAD(dst);
23742 +
23743 +               isolate_pcp_pages(batch, pcp, &dst);
23744                 pcp->count -= batch;
23745 +               local_unlock_irqrestore(pa_lock, flags);
23746 +               free_pcppages_bulk(zone, batch, &dst);
23747 +               return;
23748         }
23749  
23750  out:
23751 -       local_irq_restore(flags);
23752 +       local_unlock_irqrestore(pa_lock, flags);
23753  }
23754  
23755  /*
23756 @@ -2568,7 +2628,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
23757                 struct per_cpu_pages *pcp;
23758                 struct list_head *list;
23759  
23760 -               local_irq_save(flags);
23761 +               local_lock_irqsave(pa_lock, flags);
23762                 do {
23763                         pcp = &this_cpu_ptr(zone->pageset)->pcp;
23764                         list = &pcp->lists[migratetype];
23765 @@ -2595,7 +2655,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
23766                  * allocate greater than order-1 page units with __GFP_NOFAIL.
23767                  */
23768                 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
23769 -               spin_lock_irqsave(&zone->lock, flags);
23770 +               local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
23771  
23772                 do {
23773                         page = NULL;
23774 @@ -2607,22 +2667,24 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
23775                         if (!page)
23776                                 page = __rmqueue(zone, order, migratetype);
23777                 } while (page && check_new_pages(page, order));
23778 -               spin_unlock(&zone->lock);
23779 -               if (!page)
23780 +               if (!page) {
23781 +                       spin_unlock(&zone->lock);
23782                         goto failed;
23783 +               }
23784                 __mod_zone_freepage_state(zone, -(1 << order),
23785                                           get_pcppage_migratetype(page));
23786 +               spin_unlock(&zone->lock);
23787         }
23788  
23789         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
23790         zone_statistics(preferred_zone, zone, gfp_flags);
23791 -       local_irq_restore(flags);
23792 +       local_unlock_irqrestore(pa_lock, flags);
23793  
23794         VM_BUG_ON_PAGE(bad_range(zone, page), page);
23795         return page;
23796  
23797  failed:
23798 -       local_irq_restore(flags);
23799 +       local_unlock_irqrestore(pa_lock, flags);
23800         return NULL;
23801  }
23802  
23803 @@ -6528,7 +6590,9 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
23804         int cpu = (unsigned long)hcpu;
23805  
23806         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
23807 +               local_lock_irq_on(swapvec_lock, cpu);
23808                 lru_add_drain_cpu(cpu);
23809 +               local_unlock_irq_on(swapvec_lock, cpu);
23810                 drain_pages(cpu);
23811  
23812                 /*
23813 @@ -6554,6 +6618,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
23814  void __init page_alloc_init(void)
23815  {
23816         hotcpu_notifier(page_alloc_cpu_notify, 0);
23817 +       local_irq_lock_init(pa_lock);
23818  }
23819  
23820  /*
23821 @@ -7370,7 +7435,7 @@ void zone_pcp_reset(struct zone *zone)
23822         struct per_cpu_pageset *pset;
23823  
23824         /* avoid races with drain_pages()  */
23825 -       local_irq_save(flags);
23826 +       local_lock_irqsave(pa_lock, flags);
23827         if (zone->pageset != &boot_pageset) {
23828                 for_each_online_cpu(cpu) {
23829                         pset = per_cpu_ptr(zone->pageset, cpu);
23830 @@ -7379,7 +7444,7 @@ void zone_pcp_reset(struct zone *zone)
23831                 free_percpu(zone->pageset);
23832                 zone->pageset = &boot_pageset;
23833         }
23834 -       local_irq_restore(flags);
23835 +       local_unlock_irqrestore(pa_lock, flags);
23836  }
23837  
23838  #ifdef CONFIG_MEMORY_HOTREMOVE
23839 diff --git a/mm/slab.h b/mm/slab.h
23840 index 9653f2e2591a..b7371e026627 100644
23841 --- a/mm/slab.h
23842 +++ b/mm/slab.h
23843 @@ -426,7 +426,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
23844   * The slab lists for all objects.
23845   */
23846  struct kmem_cache_node {
23847 +#ifdef CONFIG_SLUB
23848 +       raw_spinlock_t list_lock;
23849 +#else
23850         spinlock_t list_lock;
23851 +#endif
23852  
23853  #ifdef CONFIG_SLAB
23854         struct list_head slabs_partial; /* partial list first, better asm code */
23855 diff --git a/mm/slub.c b/mm/slub.c
23856 index 9adae58462f8..4b386747f050 100644
23857 --- a/mm/slub.c
23858 +++ b/mm/slub.c
23859 @@ -1145,7 +1145,7 @@ static noinline int free_debug_processing(
23860         unsigned long uninitialized_var(flags);
23861         int ret = 0;
23862  
23863 -       spin_lock_irqsave(&n->list_lock, flags);
23864 +       raw_spin_lock_irqsave(&n->list_lock, flags);
23865         slab_lock(page);
23866  
23867         if (s->flags & SLAB_CONSISTENCY_CHECKS) {
23868 @@ -1180,7 +1180,7 @@ static noinline int free_debug_processing(
23869                          bulk_cnt, cnt);
23870  
23871         slab_unlock(page);
23872 -       spin_unlock_irqrestore(&n->list_lock, flags);
23873 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
23874         if (!ret)
23875                 slab_fix(s, "Object at 0x%p not freed", object);
23876         return ret;
23877 @@ -1308,6 +1308,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
23878  
23879  #endif /* CONFIG_SLUB_DEBUG */
23880  
23881 +struct slub_free_list {
23882 +       raw_spinlock_t          lock;
23883 +       struct list_head        list;
23884 +};
23885 +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
23886 +
23887  /*
23888   * Hooks for other subsystems that check memory allocations. In a typical
23889   * production configuration these hooks all should produce no code at all.
23890 @@ -1527,10 +1533,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
23891         void *start, *p;
23892         int idx, order;
23893         bool shuffle;
23894 +       bool enableirqs = false;
23895  
23896         flags &= gfp_allowed_mask;
23897  
23898         if (gfpflags_allow_blocking(flags))
23899 +               enableirqs = true;
23900 +#ifdef CONFIG_PREEMPT_RT_FULL
23901 +       if (system_state == SYSTEM_RUNNING)
23902 +               enableirqs = true;
23903 +#endif
23904 +       if (enableirqs)
23905                 local_irq_enable();
23906  
23907         flags |= s->allocflags;
23908 @@ -1605,7 +1618,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
23909         page->frozen = 1;
23910  
23911  out:
23912 -       if (gfpflags_allow_blocking(flags))
23913 +       if (enableirqs)
23914                 local_irq_disable();
23915         if (!page)
23916                 return NULL;
23917 @@ -1664,6 +1677,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
23918         __free_pages(page, order);
23919  }
23920  
23921 +static void free_delayed(struct list_head *h)
23922 +{
23923 +       while(!list_empty(h)) {
23924 +               struct page *page = list_first_entry(h, struct page, lru);
23925 +
23926 +               list_del(&page->lru);
23927 +               __free_slab(page->slab_cache, page);
23928 +       }
23929 +}
23930 +
23931  #define need_reserve_slab_rcu                                          \
23932         (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
23933  
23934 @@ -1695,6 +1718,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
23935                 }
23936  
23937                 call_rcu(head, rcu_free_slab);
23938 +       } else if (irqs_disabled()) {
23939 +               struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
23940 +
23941 +               raw_spin_lock(&f->lock);
23942 +               list_add(&page->lru, &f->list);
23943 +               raw_spin_unlock(&f->lock);
23944         } else
23945                 __free_slab(s, page);
23946  }
23947 @@ -1802,7 +1831,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
23948         if (!n || !n->nr_partial)
23949                 return NULL;
23950  
23951 -       spin_lock(&n->list_lock);
23952 +       raw_spin_lock(&n->list_lock);
23953         list_for_each_entry_safe(page, page2, &n->partial, lru) {
23954                 void *t;
23955  
23956 @@ -1827,7 +1856,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
23957                         break;
23958  
23959         }
23960 -       spin_unlock(&n->list_lock);
23961 +       raw_spin_unlock(&n->list_lock);
23962         return object;
23963  }
23964  
23965 @@ -2073,7 +2102,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
23966                          * that acquire_slab() will see a slab page that
23967                          * is frozen
23968                          */
23969 -                       spin_lock(&n->list_lock);
23970 +                       raw_spin_lock(&n->list_lock);
23971                 }
23972         } else {
23973                 m = M_FULL;
23974 @@ -2084,7 +2113,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
23975                          * slabs from diagnostic functions will not see
23976                          * any frozen slabs.
23977                          */
23978 -                       spin_lock(&n->list_lock);
23979 +                       raw_spin_lock(&n->list_lock);
23980                 }
23981         }
23982  
23983 @@ -2119,7 +2148,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
23984                 goto redo;
23985  
23986         if (lock)
23987 -               spin_unlock(&n->list_lock);
23988 +               raw_spin_unlock(&n->list_lock);
23989  
23990         if (m == M_FREE) {
23991                 stat(s, DEACTIVATE_EMPTY);
23992 @@ -2151,10 +2180,10 @@ static void unfreeze_partials(struct kmem_cache *s,
23993                 n2 = get_node(s, page_to_nid(page));
23994                 if (n != n2) {
23995                         if (n)
23996 -                               spin_unlock(&n->list_lock);
23997 +                               raw_spin_unlock(&n->list_lock);
23998  
23999                         n = n2;
24000 -                       spin_lock(&n->list_lock);
24001 +                       raw_spin_lock(&n->list_lock);
24002                 }
24003  
24004                 do {
24005 @@ -2183,7 +2212,7 @@ static void unfreeze_partials(struct kmem_cache *s,
24006         }
24007  
24008         if (n)
24009 -               spin_unlock(&n->list_lock);
24010 +               raw_spin_unlock(&n->list_lock);
24011  
24012         while (discard_page) {
24013                 page = discard_page;
24014 @@ -2222,14 +2251,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
24015                         pobjects = oldpage->pobjects;
24016                         pages = oldpage->pages;
24017                         if (drain && pobjects > s->cpu_partial) {
24018 +                               struct slub_free_list *f;
24019                                 unsigned long flags;
24020 +                               LIST_HEAD(tofree);
24021                                 /*
24022                                  * partial array is full. Move the existing
24023                                  * set to the per node partial list.
24024                                  */
24025                                 local_irq_save(flags);
24026                                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
24027 +                               f = this_cpu_ptr(&slub_free_list);
24028 +                               raw_spin_lock(&f->lock);
24029 +                               list_splice_init(&f->list, &tofree);
24030 +                               raw_spin_unlock(&f->lock);
24031                                 local_irq_restore(flags);
24032 +                               free_delayed(&tofree);
24033                                 oldpage = NULL;
24034                                 pobjects = 0;
24035                                 pages = 0;
24036 @@ -2301,7 +2337,22 @@ static bool has_cpu_slab(int cpu, void *info)
24037  
24038  static void flush_all(struct kmem_cache *s)
24039  {
24040 +       LIST_HEAD(tofree);
24041 +       int cpu;
24042 +
24043         on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
24044 +       for_each_online_cpu(cpu) {
24045 +               struct slub_free_list *f;
24046 +
24047 +               if (!has_cpu_slab(cpu, s))
24048 +                       continue;
24049 +
24050 +               f = &per_cpu(slub_free_list, cpu);
24051 +               raw_spin_lock_irq(&f->lock);
24052 +               list_splice_init(&f->list, &tofree);
24053 +               raw_spin_unlock_irq(&f->lock);
24054 +               free_delayed(&tofree);
24055 +       }
24056  }
24057  
24058  /*
24059 @@ -2337,10 +2388,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
24060         unsigned long x = 0;
24061         struct page *page;
24062  
24063 -       spin_lock_irqsave(&n->list_lock, flags);
24064 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24065         list_for_each_entry(page, &n->partial, lru)
24066                 x += get_count(page);
24067 -       spin_unlock_irqrestore(&n->list_lock, flags);
24068 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24069         return x;
24070  }
24071  #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
24072 @@ -2478,8 +2529,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
24073   * already disabled (which is the case for bulk allocation).
24074   */
24075  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24076 -                         unsigned long addr, struct kmem_cache_cpu *c)
24077 +                         unsigned long addr, struct kmem_cache_cpu *c,
24078 +                         struct list_head *to_free)
24079  {
24080 +       struct slub_free_list *f;
24081         void *freelist;
24082         struct page *page;
24083  
24084 @@ -2539,6 +2592,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24085         VM_BUG_ON(!c->page->frozen);
24086         c->freelist = get_freepointer(s, freelist);
24087         c->tid = next_tid(c->tid);
24088 +
24089 +out:
24090 +       f = this_cpu_ptr(&slub_free_list);
24091 +       raw_spin_lock(&f->lock);
24092 +       list_splice_init(&f->list, to_free);
24093 +       raw_spin_unlock(&f->lock);
24094 +
24095         return freelist;
24096  
24097  new_slab:
24098 @@ -2570,7 +2630,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24099         deactivate_slab(s, page, get_freepointer(s, freelist));
24100         c->page = NULL;
24101         c->freelist = NULL;
24102 -       return freelist;
24103 +       goto out;
24104  }
24105  
24106  /*
24107 @@ -2582,6 +2642,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24108  {
24109         void *p;
24110         unsigned long flags;
24111 +       LIST_HEAD(tofree);
24112  
24113         local_irq_save(flags);
24114  #ifdef CONFIG_PREEMPT
24115 @@ -2593,8 +2654,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
24116         c = this_cpu_ptr(s->cpu_slab);
24117  #endif
24118  
24119 -       p = ___slab_alloc(s, gfpflags, node, addr, c);
24120 +       p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
24121         local_irq_restore(flags);
24122 +       free_delayed(&tofree);
24123         return p;
24124  }
24125  
24126 @@ -2780,7 +2842,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24127  
24128         do {
24129                 if (unlikely(n)) {
24130 -                       spin_unlock_irqrestore(&n->list_lock, flags);
24131 +                       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24132                         n = NULL;
24133                 }
24134                 prior = page->freelist;
24135 @@ -2812,7 +2874,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24136                                  * Otherwise the list_lock will synchronize with
24137                                  * other processors updating the list of slabs.
24138                                  */
24139 -                               spin_lock_irqsave(&n->list_lock, flags);
24140 +                               raw_spin_lock_irqsave(&n->list_lock, flags);
24141  
24142                         }
24143                 }
24144 @@ -2854,7 +2916,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24145                 add_partial(n, page, DEACTIVATE_TO_TAIL);
24146                 stat(s, FREE_ADD_PARTIAL);
24147         }
24148 -       spin_unlock_irqrestore(&n->list_lock, flags);
24149 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24150         return;
24151  
24152  slab_empty:
24153 @@ -2869,7 +2931,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
24154                 remove_full(s, n, page);
24155         }
24156  
24157 -       spin_unlock_irqrestore(&n->list_lock, flags);
24158 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24159         stat(s, FREE_SLAB);
24160         discard_slab(s, page);
24161  }
24162 @@ -3074,6 +3136,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
24163                           void **p)
24164  {
24165         struct kmem_cache_cpu *c;
24166 +       LIST_HEAD(to_free);
24167         int i;
24168  
24169         /* memcg and kmem_cache debug support */
24170 @@ -3097,7 +3160,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
24171                          * of re-populating per CPU c->freelist
24172                          */
24173                         p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
24174 -                                           _RET_IP_, c);
24175 +                                           _RET_IP_, c, &to_free);
24176                         if (unlikely(!p[i]))
24177                                 goto error;
24178  
24179 @@ -3109,6 +3172,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
24180         }
24181         c->tid = next_tid(c->tid);
24182         local_irq_enable();
24183 +       free_delayed(&to_free);
24184  
24185         /* Clear memory outside IRQ disabled fastpath loop */
24186         if (unlikely(flags & __GFP_ZERO)) {
24187 @@ -3256,7 +3320,7 @@ static void
24188  init_kmem_cache_node(struct kmem_cache_node *n)
24189  {
24190         n->nr_partial = 0;
24191 -       spin_lock_init(&n->list_lock);
24192 +       raw_spin_lock_init(&n->list_lock);
24193         INIT_LIST_HEAD(&n->partial);
24194  #ifdef CONFIG_SLUB_DEBUG
24195         atomic_long_set(&n->nr_slabs, 0);
24196 @@ -3600,6 +3664,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
24197                                                         const char *text)
24198  {
24199  #ifdef CONFIG_SLUB_DEBUG
24200 +#ifdef CONFIG_PREEMPT_RT_BASE
24201 +       /* XXX move out of irq-off section */
24202 +       slab_err(s, page, text, s->name);
24203 +#else
24204         void *addr = page_address(page);
24205         void *p;
24206         unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
24207 @@ -3620,6 +3688,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
24208         slab_unlock(page);
24209         kfree(map);
24210  #endif
24211 +#endif
24212  }
24213  
24214  /*
24215 @@ -3633,7 +3702,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
24216         struct page *page, *h;
24217  
24218         BUG_ON(irqs_disabled());
24219 -       spin_lock_irq(&n->list_lock);
24220 +       raw_spin_lock_irq(&n->list_lock);
24221         list_for_each_entry_safe(page, h, &n->partial, lru) {
24222                 if (!page->inuse) {
24223                         remove_partial(n, page);
24224 @@ -3643,7 +3712,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
24225                         "Objects remaining in %s on __kmem_cache_shutdown()");
24226                 }
24227         }
24228 -       spin_unlock_irq(&n->list_lock);
24229 +       raw_spin_unlock_irq(&n->list_lock);
24230  
24231         list_for_each_entry_safe(page, h, &discard, lru)
24232                 discard_slab(s, page);
24233 @@ -3901,7 +3970,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
24234                 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
24235                         INIT_LIST_HEAD(promote + i);
24236  
24237 -               spin_lock_irqsave(&n->list_lock, flags);
24238 +               raw_spin_lock_irqsave(&n->list_lock, flags);
24239  
24240                 /*
24241                  * Build lists of slabs to discard or promote.
24242 @@ -3932,7 +4001,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
24243                 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
24244                         list_splice(promote + i, &n->partial);
24245  
24246 -               spin_unlock_irqrestore(&n->list_lock, flags);
24247 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
24248  
24249                 /* Release empty slabs */
24250                 list_for_each_entry_safe(page, t, &discard, lru)
24251 @@ -4108,6 +4177,12 @@ void __init kmem_cache_init(void)
24252  {
24253         static __initdata struct kmem_cache boot_kmem_cache,
24254                 boot_kmem_cache_node;
24255 +       int cpu;
24256 +
24257 +       for_each_possible_cpu(cpu) {
24258 +               raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
24259 +               INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
24260 +       }
24261  
24262         if (debug_guardpage_minorder())
24263                 slub_max_order = 0;
24264 @@ -4354,7 +4429,7 @@ static int validate_slab_node(struct kmem_cache *s,
24265         struct page *page;
24266         unsigned long flags;
24267  
24268 -       spin_lock_irqsave(&n->list_lock, flags);
24269 +       raw_spin_lock_irqsave(&n->list_lock, flags);
24270  
24271         list_for_each_entry(page, &n->partial, lru) {
24272                 validate_slab_slab(s, page, map);
24273 @@ -4376,7 +4451,7 @@ static int validate_slab_node(struct kmem_cache *s,
24274                        s->name, count, atomic_long_read(&n->nr_slabs));
24275  
24276  out:
24277 -       spin_unlock_irqrestore(&n->list_lock, flags);
24278 +       raw_spin_unlock_irqrestore(&n->list_lock, flags);
24279         return count;
24280  }
24281  
24282 @@ -4564,12 +4639,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
24283                 if (!atomic_long_read(&n->nr_slabs))
24284                         continue;
24285  
24286 -               spin_lock_irqsave(&n->list_lock, flags);
24287 +               raw_spin_lock_irqsave(&n->list_lock, flags);
24288                 list_for_each_entry(page, &n->partial, lru)
24289                         process_slab(&t, s, page, alloc, map);
24290                 list_for_each_entry(page, &n->full, lru)
24291                         process_slab(&t, s, page, alloc, map);
24292 -               spin_unlock_irqrestore(&n->list_lock, flags);
24293 +               raw_spin_unlock_irqrestore(&n->list_lock, flags);
24294         }
24295  
24296         for (i = 0; i < t.count; i++) {
24297 diff --git a/mm/swap.c b/mm/swap.c
24298 index 75c63bb2a1da..93fe549eb11e 100644
24299 --- a/mm/swap.c
24300 +++ b/mm/swap.c
24301 @@ -32,6 +32,7 @@
24302  #include <linux/memcontrol.h>
24303  #include <linux/gfp.h>
24304  #include <linux/uio.h>
24305 +#include <linux/locallock.h>
24306  #include <linux/hugetlb.h>
24307  #include <linux/page_idle.h>
24308  
24309 @@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
24310  #ifdef CONFIG_SMP
24311  static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
24312  #endif
24313 +static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
24314 +DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
24315  
24316  /*
24317   * This path almost never happens for VM activity - pages are normally
24318 @@ -240,11 +243,11 @@ void rotate_reclaimable_page(struct page *page)
24319                 unsigned long flags;
24320  
24321                 get_page(page);
24322 -               local_irq_save(flags);
24323 +               local_lock_irqsave(rotate_lock, flags);
24324                 pvec = this_cpu_ptr(&lru_rotate_pvecs);
24325                 if (!pagevec_add(pvec, page) || PageCompound(page))
24326                         pagevec_move_tail(pvec);
24327 -               local_irq_restore(flags);
24328 +               local_unlock_irqrestore(rotate_lock, flags);
24329         }
24330  }
24331  
24332 @@ -294,12 +297,13 @@ void activate_page(struct page *page)
24333  {
24334         page = compound_head(page);
24335         if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
24336 -               struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
24337 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24338 +                                                      activate_page_pvecs);
24339  
24340                 get_page(page);
24341                 if (!pagevec_add(pvec, page) || PageCompound(page))
24342                         pagevec_lru_move_fn(pvec, __activate_page, NULL);
24343 -               put_cpu_var(activate_page_pvecs);
24344 +               put_locked_var(swapvec_lock, activate_page_pvecs);
24345         }
24346  }
24347  
24348 @@ -326,7 +330,7 @@ void activate_page(struct page *page)
24349  
24350  static void __lru_cache_activate_page(struct page *page)
24351  {
24352 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
24353 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
24354         int i;
24355  
24356         /*
24357 @@ -348,7 +352,7 @@ static void __lru_cache_activate_page(struct page *page)
24358                 }
24359         }
24360  
24361 -       put_cpu_var(lru_add_pvec);
24362 +       put_locked_var(swapvec_lock, lru_add_pvec);
24363  }
24364  
24365  /*
24366 @@ -390,12 +394,12 @@ EXPORT_SYMBOL(mark_page_accessed);
24367  
24368  static void __lru_cache_add(struct page *page)
24369  {
24370 -       struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
24371 +       struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
24372  
24373         get_page(page);
24374         if (!pagevec_add(pvec, page) || PageCompound(page))
24375                 __pagevec_lru_add(pvec);
24376 -       put_cpu_var(lru_add_pvec);
24377 +       put_locked_var(swapvec_lock, lru_add_pvec);
24378  }
24379  
24380  /**
24381 @@ -593,9 +597,15 @@ void lru_add_drain_cpu(int cpu)
24382                 unsigned long flags;
24383  
24384                 /* No harm done if a racing interrupt already did this */
24385 -               local_irq_save(flags);
24386 +#ifdef CONFIG_PREEMPT_RT_BASE
24387 +               local_lock_irqsave_on(rotate_lock, flags, cpu);
24388                 pagevec_move_tail(pvec);
24389 -               local_irq_restore(flags);
24390 +               local_unlock_irqrestore_on(rotate_lock, flags, cpu);
24391 +#else
24392 +               local_lock_irqsave(rotate_lock, flags);
24393 +               pagevec_move_tail(pvec);
24394 +               local_unlock_irqrestore(rotate_lock, flags);
24395 +#endif
24396         }
24397  
24398         pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
24399 @@ -627,11 +637,12 @@ void deactivate_file_page(struct page *page)
24400                 return;
24401  
24402         if (likely(get_page_unless_zero(page))) {
24403 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
24404 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24405 +                                                      lru_deactivate_file_pvecs);
24406  
24407                 if (!pagevec_add(pvec, page) || PageCompound(page))
24408                         pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
24409 -               put_cpu_var(lru_deactivate_file_pvecs);
24410 +               put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
24411         }
24412  }
24413  
24414 @@ -646,27 +657,31 @@ void deactivate_file_page(struct page *page)
24415  void deactivate_page(struct page *page)
24416  {
24417         if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
24418 -               struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
24419 +               struct pagevec *pvec = &get_locked_var(swapvec_lock,
24420 +                                                      lru_deactivate_pvecs);
24421  
24422                 get_page(page);
24423                 if (!pagevec_add(pvec, page) || PageCompound(page))
24424                         pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
24425 -               put_cpu_var(lru_deactivate_pvecs);
24426 +               put_locked_var(swapvec_lock, lru_deactivate_pvecs);
24427         }
24428  }
24429  
24430  void lru_add_drain(void)
24431  {
24432 -       lru_add_drain_cpu(get_cpu());
24433 -       put_cpu();
24434 +       lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
24435 +       local_unlock_cpu(swapvec_lock);
24436  }
24437  
24438 -static void lru_add_drain_per_cpu(struct work_struct *dummy)
24439 +#ifdef CONFIG_PREEMPT_RT_BASE
24440 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
24441  {
24442 -       lru_add_drain();
24443 +       local_lock_on(swapvec_lock, cpu);
24444 +       lru_add_drain_cpu(cpu);
24445 +       local_unlock_on(swapvec_lock, cpu);
24446  }
24447  
24448 -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
24449 +#else
24450  
24451  /*
24452   * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
24453 @@ -686,6 +701,22 @@ static int __init lru_init(void)
24454  }
24455  early_initcall(lru_init);
24456  
24457 +static void lru_add_drain_per_cpu(struct work_struct *dummy)
24458 +{
24459 +       lru_add_drain();
24460 +}
24461 +
24462 +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
24463 +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
24464 +{
24465 +       struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
24466 +
24467 +       INIT_WORK(work, lru_add_drain_per_cpu);
24468 +       queue_work_on(cpu, lru_add_drain_wq, work);
24469 +       cpumask_set_cpu(cpu, has_work);
24470 +}
24471 +#endif
24472 +
24473  void lru_add_drain_all(void)
24474  {
24475         static DEFINE_MUTEX(lock);
24476 @@ -697,21 +728,18 @@ void lru_add_drain_all(void)
24477         cpumask_clear(&has_work);
24478  
24479         for_each_online_cpu(cpu) {
24480 -               struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
24481 -
24482                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
24483                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
24484                     pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
24485                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
24486 -                   need_activate_page_drain(cpu)) {
24487 -                       INIT_WORK(work, lru_add_drain_per_cpu);
24488 -                       queue_work_on(cpu, lru_add_drain_wq, work);
24489 -                       cpumask_set_cpu(cpu, &has_work);
24490 -               }
24491 +                   need_activate_page_drain(cpu))
24492 +                       remote_lru_add_drain(cpu, &has_work);
24493         }
24494  
24495 +#ifndef CONFIG_PREEMPT_RT_BASE
24496         for_each_cpu(cpu, &has_work)
24497                 flush_work(&per_cpu(lru_add_drain_work, cpu));
24498 +#endif
24499  
24500         put_online_cpus();
24501         mutex_unlock(&lock);
24502 diff --git a/mm/truncate.c b/mm/truncate.c
24503 index a01cce450a26..4bda37604f99 100644
24504 --- a/mm/truncate.c
24505 +++ b/mm/truncate.c
24506 @@ -62,9 +62,12 @@ static void clear_exceptional_entry(struct address_space *mapping,
24507          * protected by mapping->tree_lock.
24508          */
24509         if (!workingset_node_shadows(node) &&
24510 -           !list_empty(&node->private_list))
24511 -               list_lru_del(&workingset_shadow_nodes,
24512 +           !list_empty(&node->private_list)) {
24513 +               local_lock(workingset_shadow_lock);
24514 +               list_lru_del(&__workingset_shadow_nodes,
24515                                 &node->private_list);
24516 +               local_unlock(workingset_shadow_lock);
24517 +       }
24518         __radix_tree_delete_node(&mapping->page_tree, node);
24519  unlock:
24520         spin_unlock_irq(&mapping->tree_lock);
24521 diff --git a/mm/vmalloc.c b/mm/vmalloc.c
24522 index 91f44e78c516..06ec393bb97d 100644
24523 --- a/mm/vmalloc.c
24524 +++ b/mm/vmalloc.c
24525 @@ -845,7 +845,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
24526         struct vmap_block *vb;
24527         struct vmap_area *va;
24528         unsigned long vb_idx;
24529 -       int node, err;
24530 +       int node, err, cpu;
24531         void *vaddr;
24532  
24533         node = numa_node_id();
24534 @@ -888,11 +888,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
24535         BUG_ON(err);
24536         radix_tree_preload_end();
24537  
24538 -       vbq = &get_cpu_var(vmap_block_queue);
24539 +       cpu = get_cpu_light();
24540 +       vbq = this_cpu_ptr(&vmap_block_queue);
24541         spin_lock(&vbq->lock);
24542         list_add_tail_rcu(&vb->free_list, &vbq->free);
24543         spin_unlock(&vbq->lock);
24544 -       put_cpu_var(vmap_block_queue);
24545 +       put_cpu_light();
24546  
24547         return vaddr;
24548  }
24549 @@ -961,6 +962,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
24550         struct vmap_block *vb;
24551         void *vaddr = NULL;
24552         unsigned int order;
24553 +       int cpu;
24554  
24555         BUG_ON(offset_in_page(size));
24556         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
24557 @@ -975,7 +977,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
24558         order = get_order(size);
24559  
24560         rcu_read_lock();
24561 -       vbq = &get_cpu_var(vmap_block_queue);
24562 +       cpu = get_cpu_light();
24563 +       vbq = this_cpu_ptr(&vmap_block_queue);
24564         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
24565                 unsigned long pages_off;
24566  
24567 @@ -998,7 +1001,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
24568                 break;
24569         }
24570  
24571 -       put_cpu_var(vmap_block_queue);
24572 +       put_cpu_light();
24573         rcu_read_unlock();
24574  
24575         /* Allocate new block if nothing was found */
24576 diff --git a/mm/vmstat.c b/mm/vmstat.c
24577 index 89cec42d19ff..fb73631fb90b 100644
24578 --- a/mm/vmstat.c
24579 +++ b/mm/vmstat.c
24580 @@ -245,6 +245,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
24581         long x;
24582         long t;
24583  
24584 +       preempt_disable_rt();
24585         x = delta + __this_cpu_read(*p);
24586  
24587         t = __this_cpu_read(pcp->stat_threshold);
24588 @@ -254,6 +255,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
24589                 x = 0;
24590         }
24591         __this_cpu_write(*p, x);
24592 +       preempt_enable_rt();
24593  }
24594  EXPORT_SYMBOL(__mod_zone_page_state);
24595  
24596 @@ -265,6 +267,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
24597         long x;
24598         long t;
24599  
24600 +       preempt_disable_rt();
24601         x = delta + __this_cpu_read(*p);
24602  
24603         t = __this_cpu_read(pcp->stat_threshold);
24604 @@ -274,6 +277,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
24605                 x = 0;
24606         }
24607         __this_cpu_write(*p, x);
24608 +       preempt_enable_rt();
24609  }
24610  EXPORT_SYMBOL(__mod_node_page_state);
24611  
24612 @@ -306,6 +310,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
24613         s8 __percpu *p = pcp->vm_stat_diff + item;
24614         s8 v, t;
24615  
24616 +       preempt_disable_rt();
24617         v = __this_cpu_inc_return(*p);
24618         t = __this_cpu_read(pcp->stat_threshold);
24619         if (unlikely(v > t)) {
24620 @@ -314,6 +319,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
24621                 zone_page_state_add(v + overstep, zone, item);
24622                 __this_cpu_write(*p, -overstep);
24623         }
24624 +       preempt_enable_rt();
24625  }
24626  
24627  void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24628 @@ -322,6 +328,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24629         s8 __percpu *p = pcp->vm_node_stat_diff + item;
24630         s8 v, t;
24631  
24632 +       preempt_disable_rt();
24633         v = __this_cpu_inc_return(*p);
24634         t = __this_cpu_read(pcp->stat_threshold);
24635         if (unlikely(v > t)) {
24636 @@ -330,6 +337,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24637                 node_page_state_add(v + overstep, pgdat, item);
24638                 __this_cpu_write(*p, -overstep);
24639         }
24640 +       preempt_enable_rt();
24641  }
24642  
24643  void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
24644 @@ -350,6 +358,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
24645         s8 __percpu *p = pcp->vm_stat_diff + item;
24646         s8 v, t;
24647  
24648 +       preempt_disable_rt();
24649         v = __this_cpu_dec_return(*p);
24650         t = __this_cpu_read(pcp->stat_threshold);
24651         if (unlikely(v < - t)) {
24652 @@ -358,6 +367,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
24653                 zone_page_state_add(v - overstep, zone, item);
24654                 __this_cpu_write(*p, overstep);
24655         }
24656 +       preempt_enable_rt();
24657  }
24658  
24659  void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24660 @@ -366,6 +376,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24661         s8 __percpu *p = pcp->vm_node_stat_diff + item;
24662         s8 v, t;
24663  
24664 +       preempt_disable_rt();
24665         v = __this_cpu_dec_return(*p);
24666         t = __this_cpu_read(pcp->stat_threshold);
24667         if (unlikely(v < - t)) {
24668 @@ -374,6 +385,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
24669                 node_page_state_add(v - overstep, pgdat, item);
24670                 __this_cpu_write(*p, overstep);
24671         }
24672 +       preempt_enable_rt();
24673  }
24674  
24675  void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
24676 diff --git a/mm/workingset.c b/mm/workingset.c
24677 index 617475f529f4..48674bf36fb1 100644
24678 --- a/mm/workingset.c
24679 +++ b/mm/workingset.c
24680 @@ -334,7 +334,8 @@ void workingset_activation(struct page *page)
24681   * point where they would still be useful.
24682   */
24683  
24684 -struct list_lru workingset_shadow_nodes;
24685 +struct list_lru __workingset_shadow_nodes;
24686 +DEFINE_LOCAL_IRQ_LOCK(workingset_shadow_lock);
24687  
24688  static unsigned long count_shadow_nodes(struct shrinker *shrinker,
24689                                         struct shrink_control *sc)
24690 @@ -344,9 +345,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
24691         unsigned long pages;
24692  
24693         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
24694 -       local_irq_disable();
24695 -       shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
24696 -       local_irq_enable();
24697 +       local_lock_irq(workingset_shadow_lock);
24698 +       shadow_nodes = list_lru_shrink_count(&__workingset_shadow_nodes, sc);
24699 +       local_unlock_irq(workingset_shadow_lock);
24700  
24701         if (memcg_kmem_enabled()) {
24702                 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
24703 @@ -438,9 +439,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
24704         spin_unlock(&mapping->tree_lock);
24705         ret = LRU_REMOVED_RETRY;
24706  out:
24707 -       local_irq_enable();
24708 +       local_unlock_irq(workingset_shadow_lock);
24709         cond_resched();
24710 -       local_irq_disable();
24711 +       local_lock_irq(workingset_shadow_lock);
24712         spin_lock(lru_lock);
24713         return ret;
24714  }
24715 @@ -451,10 +452,10 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
24716         unsigned long ret;
24717  
24718         /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
24719 -       local_irq_disable();
24720 -       ret =  list_lru_shrink_walk(&workingset_shadow_nodes, sc,
24721 +       local_lock_irq(workingset_shadow_lock);
24722 +       ret =  list_lru_shrink_walk(&__workingset_shadow_nodes, sc,
24723                                     shadow_lru_isolate, NULL);
24724 -       local_irq_enable();
24725 +       local_unlock_irq(workingset_shadow_lock);
24726         return ret;
24727  }
24728  
24729 @@ -492,7 +493,7 @@ static int __init workingset_init(void)
24730         pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
24731                timestamp_bits, max_order, bucket_order);
24732  
24733 -       ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
24734 +       ret = list_lru_init_key(&__workingset_shadow_nodes, &shadow_nodes_key);
24735         if (ret)
24736                 goto err;
24737         ret = register_shrinker(&workingset_shadow_shrinker);
24738 @@ -500,7 +501,7 @@ static int __init workingset_init(void)
24739                 goto err_list_lru;
24740         return 0;
24741  err_list_lru:
24742 -       list_lru_destroy(&workingset_shadow_nodes);
24743 +       list_lru_destroy(&__workingset_shadow_nodes);
24744  err:
24745         return ret;
24746  }
24747 diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
24748 index b0bc023d25c5..5af6426fbcbe 100644
24749 --- a/mm/zsmalloc.c
24750 +++ b/mm/zsmalloc.c
24751 @@ -53,6 +53,7 @@
24752  #include <linux/mount.h>
24753  #include <linux/migrate.h>
24754  #include <linux/pagemap.h>
24755 +#include <linux/locallock.h>
24756  
24757  #define ZSPAGE_MAGIC   0x58
24758  
24759 @@ -70,9 +71,22 @@
24760   */
24761  #define ZS_MAX_ZSPAGE_ORDER 2
24762  #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
24763 -
24764  #define ZS_HANDLE_SIZE (sizeof(unsigned long))
24765  
24766 +#ifdef CONFIG_PREEMPT_RT_FULL
24767 +
24768 +struct zsmalloc_handle {
24769 +       unsigned long addr;
24770 +       struct mutex lock;
24771 +};
24772 +
24773 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
24774 +
24775 +#else
24776 +
24777 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
24778 +#endif
24779 +
24780  /*
24781   * Object location (<PFN>, <obj_idx>) is encoded as
24782   * as single (unsigned long) handle value.
24783 @@ -327,7 +341,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
24784  
24785  static int create_cache(struct zs_pool *pool)
24786  {
24787 -       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
24788 +       pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
24789                                         0, 0, NULL);
24790         if (!pool->handle_cachep)
24791                 return 1;
24792 @@ -351,10 +365,27 @@ static void destroy_cache(struct zs_pool *pool)
24793  
24794  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
24795  {
24796 -       return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
24797 -                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
24798 +       void *p;
24799 +
24800 +       p = kmem_cache_alloc(pool->handle_cachep,
24801 +                            gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
24802 +#ifdef CONFIG_PREEMPT_RT_FULL
24803 +       if (p) {
24804 +               struct zsmalloc_handle *zh = p;
24805 +
24806 +               mutex_init(&zh->lock);
24807 +       }
24808 +#endif
24809 +       return (unsigned long)p;
24810  }
24811  
24812 +#ifdef CONFIG_PREEMPT_RT_FULL
24813 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
24814 +{
24815 +       return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
24816 +}
24817 +#endif
24818 +
24819  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
24820  {
24821         kmem_cache_free(pool->handle_cachep, (void *)handle);
24822 @@ -373,12 +404,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
24823  
24824  static void record_obj(unsigned long handle, unsigned long obj)
24825  {
24826 +#ifdef CONFIG_PREEMPT_RT_FULL
24827 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24828 +
24829 +       WRITE_ONCE(zh->addr, obj);
24830 +#else
24831         /*
24832          * lsb of @obj represents handle lock while other bits
24833          * represent object value the handle is pointing so
24834          * updating shouldn't do store tearing.
24835          */
24836         WRITE_ONCE(*(unsigned long *)handle, obj);
24837 +#endif
24838  }
24839  
24840  /* zpool driver */
24841 @@ -467,6 +504,7 @@ MODULE_ALIAS("zpool-zsmalloc");
24842  
24843  /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
24844  static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
24845 +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
24846  
24847  static bool is_zspage_isolated(struct zspage *zspage)
24848  {
24849 @@ -902,7 +940,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
24850  
24851  static unsigned long handle_to_obj(unsigned long handle)
24852  {
24853 +#ifdef CONFIG_PREEMPT_RT_FULL
24854 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24855 +
24856 +       return zh->addr;
24857 +#else
24858         return *(unsigned long *)handle;
24859 +#endif
24860  }
24861  
24862  static unsigned long obj_to_head(struct page *page, void *obj)
24863 @@ -916,22 +960,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
24864  
24865  static inline int testpin_tag(unsigned long handle)
24866  {
24867 +#ifdef CONFIG_PREEMPT_RT_FULL
24868 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24869 +
24870 +       return mutex_is_locked(&zh->lock);
24871 +#else
24872         return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
24873 +#endif
24874  }
24875  
24876  static inline int trypin_tag(unsigned long handle)
24877  {
24878 +#ifdef CONFIG_PREEMPT_RT_FULL
24879 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24880 +
24881 +       return mutex_trylock(&zh->lock);
24882 +#else
24883         return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
24884 +#endif
24885  }
24886  
24887  static void pin_tag(unsigned long handle)
24888  {
24889 +#ifdef CONFIG_PREEMPT_RT_FULL
24890 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24891 +
24892 +       return mutex_lock(&zh->lock);
24893 +#else
24894         bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
24895 +#endif
24896  }
24897  
24898  static void unpin_tag(unsigned long handle)
24899  {
24900 +#ifdef CONFIG_PREEMPT_RT_FULL
24901 +       struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
24902 +
24903 +       return mutex_unlock(&zh->lock);
24904 +#else
24905         bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
24906 +#endif
24907  }
24908  
24909  static void reset_page(struct page *page)
24910 @@ -1423,7 +1491,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
24911         class = pool->size_class[class_idx];
24912         off = (class->size * obj_idx) & ~PAGE_MASK;
24913  
24914 -       area = &get_cpu_var(zs_map_area);
24915 +       area = &get_locked_var(zs_map_area_lock, zs_map_area);
24916         area->vm_mm = mm;
24917         if (off + class->size <= PAGE_SIZE) {
24918                 /* this object is contained entirely within a page */
24919 @@ -1477,7 +1545,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
24920  
24921                 __zs_unmap_object(area, pages, off, class->size);
24922         }
24923 -       put_cpu_var(zs_map_area);
24924 +       put_locked_var(zs_map_area_lock, zs_map_area);
24925  
24926         migrate_read_unlock(zspage);
24927         unpin_tag(handle);
24928 diff --git a/net/core/dev.c b/net/core/dev.c
24929 index ea6312057a71..d114a4692cde 100644
24930 --- a/net/core/dev.c
24931 +++ b/net/core/dev.c
24932 @@ -190,6 +190,7 @@ static unsigned int napi_gen_id = NR_CPUS;
24933  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
24934  
24935  static seqcount_t devnet_rename_seq;
24936 +static DEFINE_MUTEX(devnet_rename_mutex);
24937  
24938  static inline void dev_base_seq_inc(struct net *net)
24939  {
24940 @@ -211,14 +212,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
24941  static inline void rps_lock(struct softnet_data *sd)
24942  {
24943  #ifdef CONFIG_RPS
24944 -       spin_lock(&sd->input_pkt_queue.lock);
24945 +       raw_spin_lock(&sd->input_pkt_queue.raw_lock);
24946  #endif
24947  }
24948  
24949  static inline void rps_unlock(struct softnet_data *sd)
24950  {
24951  #ifdef CONFIG_RPS
24952 -       spin_unlock(&sd->input_pkt_queue.lock);
24953 +       raw_spin_unlock(&sd->input_pkt_queue.raw_lock);
24954  #endif
24955  }
24956  
24957 @@ -888,7 +889,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
24958         strcpy(name, dev->name);
24959         rcu_read_unlock();
24960         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
24961 -               cond_resched();
24962 +               mutex_lock(&devnet_rename_mutex);
24963 +               mutex_unlock(&devnet_rename_mutex);
24964                 goto retry;
24965         }
24966  
24967 @@ -1157,20 +1159,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
24968         if (dev->flags & IFF_UP)
24969                 return -EBUSY;
24970  
24971 -       write_seqcount_begin(&devnet_rename_seq);
24972 +       mutex_lock(&devnet_rename_mutex);
24973 +       __raw_write_seqcount_begin(&devnet_rename_seq);
24974  
24975 -       if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
24976 -               write_seqcount_end(&devnet_rename_seq);
24977 -               return 0;
24978 -       }
24979 +       if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
24980 +               goto outunlock;
24981  
24982         memcpy(oldname, dev->name, IFNAMSIZ);
24983  
24984         err = dev_get_valid_name(net, dev, newname);
24985 -       if (err < 0) {
24986 -               write_seqcount_end(&devnet_rename_seq);
24987 -               return err;
24988 -       }
24989 +       if (err < 0)
24990 +               goto outunlock;
24991  
24992         if (oldname[0] && !strchr(oldname, '%'))
24993                 netdev_info(dev, "renamed from %s\n", oldname);
24994 @@ -1183,11 +1182,12 @@ int dev_change_name(struct net_device *dev, const char *newname)
24995         if (ret) {
24996                 memcpy(dev->name, oldname, IFNAMSIZ);
24997                 dev->name_assign_type = old_assign_type;
24998 -               write_seqcount_end(&devnet_rename_seq);
24999 -               return ret;
25000 +               err = ret;
25001 +               goto outunlock;
25002         }
25003  
25004 -       write_seqcount_end(&devnet_rename_seq);
25005 +       __raw_write_seqcount_end(&devnet_rename_seq);
25006 +       mutex_unlock(&devnet_rename_mutex);
25007  
25008         netdev_adjacent_rename_links(dev, oldname);
25009  
25010 @@ -1208,7 +1208,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
25011                 /* err >= 0 after dev_alloc_name() or stores the first errno */
25012                 if (err >= 0) {
25013                         err = ret;
25014 -                       write_seqcount_begin(&devnet_rename_seq);
25015 +                       mutex_lock(&devnet_rename_mutex);
25016 +                       __raw_write_seqcount_begin(&devnet_rename_seq);
25017                         memcpy(dev->name, oldname, IFNAMSIZ);
25018                         memcpy(oldname, newname, IFNAMSIZ);
25019                         dev->name_assign_type = old_assign_type;
25020 @@ -1221,6 +1222,11 @@ int dev_change_name(struct net_device *dev, const char *newname)
25021         }
25022  
25023         return err;
25024 +
25025 +outunlock:
25026 +       __raw_write_seqcount_end(&devnet_rename_seq);
25027 +       mutex_unlock(&devnet_rename_mutex);
25028 +       return err;
25029  }
25030  
25031  /**
25032 @@ -2268,6 +2274,7 @@ static void __netif_reschedule(struct Qdisc *q)
25033         sd->output_queue_tailp = &q->next_sched;
25034         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25035         local_irq_restore(flags);
25036 +       preempt_check_resched_rt();
25037  }
25038  
25039  void __netif_schedule(struct Qdisc *q)
25040 @@ -2349,6 +2356,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
25041         __this_cpu_write(softnet_data.completion_queue, skb);
25042         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25043         local_irq_restore(flags);
25044 +       preempt_check_resched_rt();
25045  }
25046  EXPORT_SYMBOL(__dev_kfree_skb_irq);
25047  
25048 @@ -3082,7 +3090,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
25049          * This permits qdisc->running owner to get the lock more
25050          * often and dequeue packets faster.
25051          */
25052 +#ifdef CONFIG_PREEMPT_RT_FULL
25053 +       contended = true;
25054 +#else
25055         contended = qdisc_is_running(q);
25056 +#endif
25057         if (unlikely(contended))
25058                 spin_lock(&q->busylock);
25059  
25060 @@ -3145,8 +3157,10 @@ static void skb_update_prio(struct sk_buff *skb)
25061  #define skb_update_prio(skb)
25062  #endif
25063  
25064 +#ifndef CONFIG_PREEMPT_RT_FULL
25065  DEFINE_PER_CPU(int, xmit_recursion);
25066  EXPORT_SYMBOL(xmit_recursion);
25067 +#endif
25068  
25069  /**
25070   *     dev_loopback_xmit - loop back @skb
25071 @@ -3390,8 +3404,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
25072                 int cpu = smp_processor_id(); /* ok because BHs are off */
25073  
25074                 if (txq->xmit_lock_owner != cpu) {
25075 -                       if (unlikely(__this_cpu_read(xmit_recursion) >
25076 -                                    XMIT_RECURSION_LIMIT))
25077 +                       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT))
25078                                 goto recursion_alert;
25079  
25080                         skb = validate_xmit_skb(skb, dev);
25081 @@ -3401,9 +3414,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
25082                         HARD_TX_LOCK(dev, txq, cpu);
25083  
25084                         if (!netif_xmit_stopped(txq)) {
25085 -                               __this_cpu_inc(xmit_recursion);
25086 +                               xmit_rec_inc();
25087                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
25088 -                               __this_cpu_dec(xmit_recursion);
25089 +                               xmit_rec_dec();
25090                                 if (dev_xmit_complete(rc)) {
25091                                         HARD_TX_UNLOCK(dev, txq);
25092                                         goto out;
25093 @@ -3777,6 +3790,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
25094         rps_unlock(sd);
25095  
25096         local_irq_restore(flags);
25097 +       preempt_check_resched_rt();
25098  
25099         atomic_long_inc(&skb->dev->rx_dropped);
25100         kfree_skb(skb);
25101 @@ -3795,7 +3809,7 @@ static int netif_rx_internal(struct sk_buff *skb)
25102                 struct rps_dev_flow voidflow, *rflow = &voidflow;
25103                 int cpu;
25104  
25105 -               preempt_disable();
25106 +               migrate_disable();
25107                 rcu_read_lock();
25108  
25109                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
25110 @@ -3805,13 +3819,13 @@ static int netif_rx_internal(struct sk_buff *skb)
25111                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
25112  
25113                 rcu_read_unlock();
25114 -               preempt_enable();
25115 +               migrate_enable();
25116         } else
25117  #endif
25118         {
25119                 unsigned int qtail;
25120 -               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
25121 -               put_cpu();
25122 +               ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail);
25123 +               put_cpu_light();
25124         }
25125         return ret;
25126  }
25127 @@ -3845,11 +3859,9 @@ int netif_rx_ni(struct sk_buff *skb)
25128  
25129         trace_netif_rx_ni_entry(skb);
25130  
25131 -       preempt_disable();
25132 +       local_bh_disable();
25133         err = netif_rx_internal(skb);
25134 -       if (local_softirq_pending())
25135 -               do_softirq();
25136 -       preempt_enable();
25137 +       local_bh_enable();
25138  
25139         return err;
25140  }
25141 @@ -4321,7 +4333,7 @@ static void flush_backlog(void *arg)
25142         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
25143                 if (skb->dev == dev) {
25144                         __skb_unlink(skb, &sd->input_pkt_queue);
25145 -                       kfree_skb(skb);
25146 +                       __skb_queue_tail(&sd->tofree_queue, skb);
25147                         input_queue_head_incr(sd);
25148                 }
25149         }
25150 @@ -4330,10 +4342,13 @@ static void flush_backlog(void *arg)
25151         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
25152                 if (skb->dev == dev) {
25153                         __skb_unlink(skb, &sd->process_queue);
25154 -                       kfree_skb(skb);
25155 +                       __skb_queue_tail(&sd->tofree_queue, skb);
25156                         input_queue_head_incr(sd);
25157                 }
25158         }
25159 +
25160 +       if (!skb_queue_empty(&sd->tofree_queue))
25161 +               raise_softirq_irqoff(NET_RX_SOFTIRQ);
25162  }
25163  
25164  static int napi_gro_complete(struct sk_buff *skb)
25165 @@ -4795,6 +4810,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
25166                 sd->rps_ipi_list = NULL;
25167  
25168                 local_irq_enable();
25169 +               preempt_check_resched_rt();
25170  
25171                 /* Send pending IPI's to kick RPS processing on remote cpus. */
25172                 while (remsd) {
25173 @@ -4808,6 +4824,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
25174         } else
25175  #endif
25176                 local_irq_enable();
25177 +       preempt_check_resched_rt();
25178  }
25179  
25180  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
25181 @@ -4889,6 +4906,7 @@ void __napi_schedule(struct napi_struct *n)
25182         local_irq_save(flags);
25183         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
25184         local_irq_restore(flags);
25185 +       preempt_check_resched_rt();
25186  }
25187  EXPORT_SYMBOL(__napi_schedule);
25188  
25189 @@ -5229,7 +5247,7 @@ static void net_rx_action(struct softirq_action *h)
25190         list_splice_tail(&repoll, &list);
25191         list_splice(&list, &sd->poll_list);
25192         if (!list_empty(&sd->poll_list))
25193 -               __raise_softirq_irqoff(NET_RX_SOFTIRQ);
25194 +               __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
25195  
25196         net_rps_action_and_irq_enable(sd);
25197  }
25198 @@ -7736,7 +7754,7 @@ EXPORT_SYMBOL(free_netdev);
25199  void synchronize_net(void)
25200  {
25201         might_sleep();
25202 -       if (rtnl_is_locked())
25203 +       if (rtnl_is_locked() && !IS_ENABLED(CONFIG_PREEMPT_RT_FULL))
25204                 synchronize_rcu_expedited();
25205         else
25206                 synchronize_rcu();
25207 @@ -7977,16 +7995,20 @@ static int dev_cpu_callback(struct notifier_block *nfb,
25208  
25209         raise_softirq_irqoff(NET_TX_SOFTIRQ);
25210         local_irq_enable();
25211 +       preempt_check_resched_rt();
25212  
25213         /* Process offline CPU's input_pkt_queue */
25214         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
25215                 netif_rx_ni(skb);
25216                 input_queue_head_incr(oldsd);
25217         }
25218 -       while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
25219 +       while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
25220                 netif_rx_ni(skb);
25221                 input_queue_head_incr(oldsd);
25222         }
25223 +       while ((skb = __skb_dequeue(&oldsd->tofree_queue))) {
25224 +               kfree_skb(skb);
25225 +       }
25226  
25227         return NOTIFY_OK;
25228  }
25229 @@ -8288,8 +8310,9 @@ static int __init net_dev_init(void)
25230         for_each_possible_cpu(i) {
25231                 struct softnet_data *sd = &per_cpu(softnet_data, i);
25232  
25233 -               skb_queue_head_init(&sd->input_pkt_queue);
25234 -               skb_queue_head_init(&sd->process_queue);
25235 +               skb_queue_head_init_raw(&sd->input_pkt_queue);
25236 +               skb_queue_head_init_raw(&sd->process_queue);
25237 +               skb_queue_head_init_raw(&sd->tofree_queue);
25238                 INIT_LIST_HEAD(&sd->poll_list);
25239                 sd->output_queue_tailp = &sd->output_queue;
25240  #ifdef CONFIG_RPS
25241 diff --git a/net/core/filter.c b/net/core/filter.c
25242 index cb06aceb512a..3585a8982287 100644
25243 --- a/net/core/filter.c
25244 +++ b/net/core/filter.c
25245 @@ -1592,7 +1592,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
25246  {
25247         int ret;
25248  
25249 -       if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
25250 +       if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) {
25251                 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
25252                 kfree_skb(skb);
25253                 return -ENETDOWN;
25254 @@ -1600,9 +1600,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
25255  
25256         skb->dev = dev;
25257  
25258 -       __this_cpu_inc(xmit_recursion);
25259 +       xmit_rec_inc();
25260         ret = dev_queue_xmit(skb);
25261 -       __this_cpu_dec(xmit_recursion);
25262 +       xmit_rec_dec();
25263  
25264         return ret;
25265  }
25266 diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
25267 index cad8e791f28e..2a9364fe62a5 100644
25268 --- a/net/core/gen_estimator.c
25269 +++ b/net/core/gen_estimator.c
25270 @@ -84,7 +84,7 @@ struct gen_estimator
25271         struct gnet_stats_basic_packed  *bstats;
25272         struct gnet_stats_rate_est64    *rate_est;
25273         spinlock_t              *stats_lock;
25274 -       seqcount_t              *running;
25275 +       net_seqlock_t           *running;
25276         int                     ewma_log;
25277         u32                     last_packets;
25278         unsigned long           avpps;
25279 @@ -213,7 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
25280                       struct gnet_stats_basic_cpu __percpu *cpu_bstats,
25281                       struct gnet_stats_rate_est64 *rate_est,
25282                       spinlock_t *stats_lock,
25283 -                     seqcount_t *running,
25284 +                     net_seqlock_t *running,
25285                       struct nlattr *opt)
25286  {
25287         struct gen_estimator *est;
25288 @@ -309,7 +309,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
25289                           struct gnet_stats_basic_cpu __percpu *cpu_bstats,
25290                           struct gnet_stats_rate_est64 *rate_est,
25291                           spinlock_t *stats_lock,
25292 -                         seqcount_t *running, struct nlattr *opt)
25293 +                         net_seqlock_t *running, struct nlattr *opt)
25294  {
25295         gen_kill_estimator(bstats, rate_est);
25296         return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
25297 diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
25298 index 508e051304fb..bc3b17b78c94 100644
25299 --- a/net/core/gen_stats.c
25300 +++ b/net/core/gen_stats.c
25301 @@ -130,7 +130,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
25302  }
25303  
25304  void
25305 -__gnet_stats_copy_basic(const seqcount_t *running,
25306 +__gnet_stats_copy_basic(net_seqlock_t *running,
25307                         struct gnet_stats_basic_packed *bstats,
25308                         struct gnet_stats_basic_cpu __percpu *cpu,
25309                         struct gnet_stats_basic_packed *b)
25310 @@ -143,10 +143,10 @@ __gnet_stats_copy_basic(const seqcount_t *running,
25311         }
25312         do {
25313                 if (running)
25314 -                       seq = read_seqcount_begin(running);
25315 +                       seq = net_seq_begin(running);
25316                 bstats->bytes = b->bytes;
25317                 bstats->packets = b->packets;
25318 -       } while (running && read_seqcount_retry(running, seq));
25319 +       } while (running && net_seq_retry(running, seq));
25320  }
25321  EXPORT_SYMBOL(__gnet_stats_copy_basic);
25322  
25323 @@ -164,7 +164,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
25324   * if the room in the socket buffer was not sufficient.
25325   */
25326  int
25327 -gnet_stats_copy_basic(const seqcount_t *running,
25328 +gnet_stats_copy_basic(net_seqlock_t *running,
25329                       struct gnet_dump *d,
25330                       struct gnet_stats_basic_cpu __percpu *cpu,
25331                       struct gnet_stats_basic_packed *b)
25332 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
25333 index 3864b4b68fa1..55c73ade9faa 100644
25334 --- a/net/core/skbuff.c
25335 +++ b/net/core/skbuff.c
25336 @@ -64,6 +64,7 @@
25337  #include <linux/errqueue.h>
25338  #include <linux/prefetch.h>
25339  #include <linux/if_vlan.h>
25340 +#include <linux/locallock.h>
25341  
25342  #include <net/protocol.h>
25343  #include <net/dst.h>
25344 @@ -360,6 +361,8 @@ struct napi_alloc_cache {
25345  
25346  static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
25347  static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
25348 +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock);
25349 +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock);
25350  
25351  static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25352  {
25353 @@ -367,10 +370,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25354         unsigned long flags;
25355         void *data;
25356  
25357 -       local_irq_save(flags);
25358 +       local_lock_irqsave(netdev_alloc_lock, flags);
25359         nc = this_cpu_ptr(&netdev_alloc_cache);
25360         data = __alloc_page_frag(nc, fragsz, gfp_mask);
25361 -       local_irq_restore(flags);
25362 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
25363         return data;
25364  }
25365  
25366 @@ -389,9 +392,13 @@ EXPORT_SYMBOL(netdev_alloc_frag);
25367  
25368  static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
25369  {
25370 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25371 +       struct napi_alloc_cache *nc;
25372 +       void *data;
25373  
25374 -       return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
25375 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25376 +       data = __alloc_page_frag(&nc->page, fragsz, gfp_mask);
25377 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25378 +       return data;
25379  }
25380  
25381  void *napi_alloc_frag(unsigned int fragsz)
25382 @@ -438,13 +445,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
25383         if (sk_memalloc_socks())
25384                 gfp_mask |= __GFP_MEMALLOC;
25385  
25386 -       local_irq_save(flags);
25387 +       local_lock_irqsave(netdev_alloc_lock, flags);
25388  
25389         nc = this_cpu_ptr(&netdev_alloc_cache);
25390         data = __alloc_page_frag(nc, len, gfp_mask);
25391         pfmemalloc = nc->pfmemalloc;
25392  
25393 -       local_irq_restore(flags);
25394 +       local_unlock_irqrestore(netdev_alloc_lock, flags);
25395  
25396         if (unlikely(!data))
25397                 return NULL;
25398 @@ -485,9 +492,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
25399  struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25400                                  gfp_t gfp_mask)
25401  {
25402 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25403 +       struct napi_alloc_cache *nc;
25404         struct sk_buff *skb;
25405         void *data;
25406 +       bool pfmemalloc;
25407  
25408         len += NET_SKB_PAD + NET_IP_ALIGN;
25409  
25410 @@ -505,7 +513,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25411         if (sk_memalloc_socks())
25412                 gfp_mask |= __GFP_MEMALLOC;
25413  
25414 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25415         data = __alloc_page_frag(&nc->page, len, gfp_mask);
25416 +       pfmemalloc = nc->page.pfmemalloc;
25417 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25418         if (unlikely(!data))
25419                 return NULL;
25420  
25421 @@ -516,7 +527,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
25422         }
25423  
25424         /* use OR instead of assignment to avoid clearing of bits in mask */
25425 -       if (nc->page.pfmemalloc)
25426 +       if (pfmemalloc)
25427                 skb->pfmemalloc = 1;
25428         skb->head_frag = 1;
25429  
25430 @@ -760,23 +771,26 @@ EXPORT_SYMBOL(consume_skb);
25431  
25432  void __kfree_skb_flush(void)
25433  {
25434 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25435 +       struct napi_alloc_cache *nc;
25436  
25437 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25438         /* flush skb_cache if containing objects */
25439         if (nc->skb_count) {
25440                 kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
25441                                      nc->skb_cache);
25442                 nc->skb_count = 0;
25443         }
25444 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25445  }
25446  
25447  static inline void _kfree_skb_defer(struct sk_buff *skb)
25448  {
25449 -       struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
25450 +       struct napi_alloc_cache *nc;
25451  
25452         /* drop skb->head and call any destructors for packet */
25453         skb_release_all(skb);
25454  
25455 +       nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25456         /* record skb to CPU local list */
25457         nc->skb_cache[nc->skb_count++] = skb;
25458  
25459 @@ -791,6 +805,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb)
25460                                      nc->skb_cache);
25461                 nc->skb_count = 0;
25462         }
25463 +       put_locked_var(napi_alloc_cache_lock, napi_alloc_cache);
25464  }
25465  void __kfree_skb_defer(struct sk_buff *skb)
25466  {
25467 diff --git a/net/core/sock.c b/net/core/sock.c
25468 index fd7b41edf1ce..e425d259a9f0 100644
25469 --- a/net/core/sock.c
25470 +++ b/net/core/sock.c
25471 @@ -2508,12 +2508,11 @@ void lock_sock_nested(struct sock *sk, int subclass)
25472         if (sk->sk_lock.owned)
25473                 __lock_sock(sk);
25474         sk->sk_lock.owned = 1;
25475 -       spin_unlock(&sk->sk_lock.slock);
25476 +       spin_unlock_bh(&sk->sk_lock.slock);
25477         /*
25478          * The sk_lock has mutex_lock() semantics here:
25479          */
25480         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
25481 -       local_bh_enable();
25482  }
25483  EXPORT_SYMBOL(lock_sock_nested);
25484  
25485 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
25486 index 38abe70e595f..443259a04862 100644
25487 --- a/net/ipv4/icmp.c
25488 +++ b/net/ipv4/icmp.c
25489 @@ -69,6 +69,7 @@
25490  #include <linux/jiffies.h>
25491  #include <linux/kernel.h>
25492  #include <linux/fcntl.h>
25493 +#include <linux/sysrq.h>
25494  #include <linux/socket.h>
25495  #include <linux/in.h>
25496  #include <linux/inet.h>
25497 @@ -77,6 +78,7 @@
25498  #include <linux/string.h>
25499  #include <linux/netfilter_ipv4.h>
25500  #include <linux/slab.h>
25501 +#include <linux/locallock.h>
25502  #include <net/snmp.h>
25503  #include <net/ip.h>
25504  #include <net/route.h>
25505 @@ -204,6 +206,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
25506   *
25507   *     On SMP we have one ICMP socket per-cpu.
25508   */
25509 +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock);
25510 +
25511  static struct sock *icmp_sk(struct net *net)
25512  {
25513         return *this_cpu_ptr(net->ipv4.icmp_sk);
25514 @@ -215,12 +219,14 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
25515  
25516         local_bh_disable();
25517  
25518 +       local_lock(icmp_sk_lock);
25519         sk = icmp_sk(net);
25520  
25521         if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
25522                 /* This can happen if the output path signals a
25523                  * dst_link_failure() for an outgoing ICMP packet.
25524                  */
25525 +               local_unlock(icmp_sk_lock);
25526                 local_bh_enable();
25527                 return NULL;
25528         }
25529 @@ -230,6 +236,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
25530  static inline void icmp_xmit_unlock(struct sock *sk)
25531  {
25532         spin_unlock_bh(&sk->sk_lock.slock);
25533 +       local_unlock(icmp_sk_lock);
25534  }
25535  
25536  int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
25537 @@ -358,6 +365,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
25538         struct sock *sk;
25539         struct sk_buff *skb;
25540  
25541 +       local_lock(icmp_sk_lock);
25542         sk = icmp_sk(dev_net((*rt)->dst.dev));
25543         if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
25544                            icmp_param->data_len+icmp_param->head_len,
25545 @@ -380,6 +388,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
25546                 skb->ip_summed = CHECKSUM_NONE;
25547                 ip_push_pending_frames(sk, fl4);
25548         }
25549 +       local_unlock(icmp_sk_lock);
25550  }
25551  
25552  /*
25553 @@ -891,6 +900,30 @@ static bool icmp_redirect(struct sk_buff *skb)
25554  }
25555  
25556  /*
25557 + * 32bit and 64bit have different timestamp length, so we check for
25558 + * the cookie at offset 20 and verify it is repeated at offset 50
25559 + */
25560 +#define CO_POS0                20
25561 +#define CO_POS1                50
25562 +#define CO_SIZE                sizeof(int)
25563 +#define ICMP_SYSRQ_SIZE        57
25564 +
25565 +/*
25566 + * We got a ICMP_SYSRQ_SIZE sized ping request. Check for the cookie
25567 + * pattern and if it matches send the next byte as a trigger to sysrq.
25568 + */
25569 +static void icmp_check_sysrq(struct net *net, struct sk_buff *skb)
25570 +{
25571 +       int cookie = htonl(net->ipv4.sysctl_icmp_echo_sysrq);
25572 +       char *p = skb->data;
25573 +
25574 +       if (!memcmp(&cookie, p + CO_POS0, CO_SIZE) &&
25575 +           !memcmp(&cookie, p + CO_POS1, CO_SIZE) &&
25576 +           p[CO_POS0 + CO_SIZE] == p[CO_POS1 + CO_SIZE])
25577 +               handle_sysrq(p[CO_POS0 + CO_SIZE]);
25578 +}
25579 +
25580 +/*
25581   *     Handle ICMP_ECHO ("ping") requests.
25582   *
25583   *     RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
25584 @@ -917,6 +950,11 @@ static bool icmp_echo(struct sk_buff *skb)
25585                 icmp_param.data_len        = skb->len;
25586                 icmp_param.head_len        = sizeof(struct icmphdr);
25587                 icmp_reply(&icmp_param, skb);
25588 +
25589 +               if (skb->len == ICMP_SYSRQ_SIZE &&
25590 +                   net->ipv4.sysctl_icmp_echo_sysrq) {
25591 +                       icmp_check_sysrq(net, skb);
25592 +               }
25593         }
25594         /* should there be an ICMP stat for ignored echos? */
25595         return true;
25596 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
25597 index 1cb67de106fe..332a485323f0 100644
25598 --- a/net/ipv4/sysctl_net_ipv4.c
25599 +++ b/net/ipv4/sysctl_net_ipv4.c
25600 @@ -681,6 +681,13 @@ static struct ctl_table ipv4_net_table[] = {
25601                 .proc_handler   = proc_dointvec
25602         },
25603         {
25604 +               .procname       = "icmp_echo_sysrq",
25605 +               .data           = &init_net.ipv4.sysctl_icmp_echo_sysrq,
25606 +               .maxlen         = sizeof(int),
25607 +               .mode           = 0644,
25608 +               .proc_handler   = proc_dointvec
25609 +       },
25610 +       {
25611                 .procname       = "icmp_ignore_bogus_error_responses",
25612                 .data           = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
25613                 .maxlen         = sizeof(int),
25614 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
25615 index 7158d4f8dae4..0dc007fc6704 100644
25616 --- a/net/ipv4/tcp_ipv4.c
25617 +++ b/net/ipv4/tcp_ipv4.c
25618 @@ -62,6 +62,7 @@
25619  #include <linux/init.h>
25620  #include <linux/times.h>
25621  #include <linux/slab.h>
25622 +#include <linux/locallock.h>
25623  
25624  #include <net/net_namespace.h>
25625  #include <net/icmp.h>
25626 @@ -565,6 +566,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
25627  }
25628  EXPORT_SYMBOL(tcp_v4_send_check);
25629  
25630 +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock);
25631  /*
25632   *     This routine will send an RST to the other tcp.
25633   *
25634 @@ -692,6 +694,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
25635                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
25636  
25637         arg.tos = ip_hdr(skb)->tos;
25638 +
25639 +       local_lock(tcp_sk_lock);
25640         local_bh_disable();
25641         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
25642                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
25643 @@ -701,6 +705,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
25644         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
25645         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
25646         local_bh_enable();
25647 +       local_unlock(tcp_sk_lock);
25648  
25649  #ifdef CONFIG_TCP_MD5SIG
25650  out:
25651 @@ -776,6 +781,7 @@ static void tcp_v4_send_ack(struct net *net,
25652         if (oif)
25653                 arg.bound_dev_if = oif;
25654         arg.tos = tos;
25655 +       local_lock(tcp_sk_lock);
25656         local_bh_disable();
25657         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
25658                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
25659 @@ -784,6 +790,7 @@ static void tcp_v4_send_ack(struct net *net,
25660  
25661         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
25662         local_bh_enable();
25663 +       local_unlock(tcp_sk_lock);
25664  }
25665  
25666  static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
25667 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
25668 index 9dce3b157908..525efa5309ac 100644
25669 --- a/net/mac80211/rx.c
25670 +++ b/net/mac80211/rx.c
25671 @@ -4064,7 +4064,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
25672         struct ieee80211_supported_band *sband;
25673         struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
25674  
25675 -       WARN_ON_ONCE(softirq_count() == 0);
25676 +       WARN_ON_ONCE_NONRT(softirq_count() == 0);
25677  
25678         if (WARN_ON(status->band >= NUM_NL80211_BANDS))
25679                 goto drop;
25680 diff --git a/net/netfilter/core.c b/net/netfilter/core.c
25681 index f39276d1c2d7..10880c89d62f 100644
25682 --- a/net/netfilter/core.c
25683 +++ b/net/netfilter/core.c
25684 @@ -22,11 +22,17 @@
25685  #include <linux/proc_fs.h>
25686  #include <linux/mutex.h>
25687  #include <linux/slab.h>
25688 +#include <linux/locallock.h>
25689  #include <net/net_namespace.h>
25690  #include <net/sock.h>
25691  
25692  #include "nf_internals.h"
25693  
25694 +#ifdef CONFIG_PREEMPT_RT_BASE
25695 +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock);
25696 +EXPORT_PER_CPU_SYMBOL(xt_write_lock);
25697 +#endif
25698 +
25699  static DEFINE_MUTEX(afinfo_mutex);
25700  
25701  const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
25702 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
25703 index 33a4697d5539..475cb74bf825 100644
25704 --- a/net/packet/af_packet.c
25705 +++ b/net/packet/af_packet.c
25706 @@ -63,6 +63,7 @@
25707  #include <linux/if_packet.h>
25708  #include <linux/wireless.h>
25709  #include <linux/kernel.h>
25710 +#include <linux/delay.h>
25711  #include <linux/kmod.h>
25712  #include <linux/slab.h>
25713  #include <linux/vmalloc.h>
25714 @@ -695,7 +696,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data)
25715         if (BLOCK_NUM_PKTS(pbd)) {
25716                 while (atomic_read(&pkc->blk_fill_in_prog)) {
25717                         /* Waiting for skb_copy_bits to finish... */
25718 -                       cpu_relax();
25719 +                       cpu_chill();
25720                 }
25721         }
25722  
25723 @@ -957,7 +958,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
25724                 if (!(status & TP_STATUS_BLK_TMO)) {
25725                         while (atomic_read(&pkc->blk_fill_in_prog)) {
25726                                 /* Waiting for skb_copy_bits to finish... */
25727 -                               cpu_relax();
25728 +                               cpu_chill();
25729                         }
25730                 }
25731                 prb_close_block(pkc, pbd, po, status);
25732 diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
25733 index 977f69886c00..f3e7a36b0396 100644
25734 --- a/net/rds/ib_rdma.c
25735 +++ b/net/rds/ib_rdma.c
25736 @@ -34,6 +34,7 @@
25737  #include <linux/slab.h>
25738  #include <linux/rculist.h>
25739  #include <linux/llist.h>
25740 +#include <linux/delay.h>
25741  
25742  #include "rds_single_path.h"
25743  #include "ib_mr.h"
25744 @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void)
25745         for_each_online_cpu(cpu) {
25746                 flag = &per_cpu(clean_list_grace, cpu);
25747                 while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
25748 -                       cpu_relax();
25749 +                       cpu_chill();
25750         }
25751  }
25752  
25753 diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
25754 index 814d285ff802..d4d088e9be85 100644
25755 --- a/net/rxrpc/security.c
25756 +++ b/net/rxrpc/security.c
25757 @@ -19,9 +19,6 @@
25758  #include <keys/rxrpc-type.h>
25759  #include "ar-internal.h"
25760  
25761 -static LIST_HEAD(rxrpc_security_methods);
25762 -static DECLARE_RWSEM(rxrpc_security_sem);
25763 -
25764  static const struct rxrpc_security *rxrpc_security_types[] = {
25765         [RXRPC_SECURITY_NONE]   = &rxrpc_no_security,
25766  #ifdef CONFIG_RXKAD
25767 diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
25768 index 12ebde845523..99f3ce50c6c4 100644
25769 --- a/net/sched/sch_api.c
25770 +++ b/net/sched/sch_api.c
25771 @@ -975,7 +975,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
25772                         rcu_assign_pointer(sch->stab, stab);
25773                 }
25774                 if (tca[TCA_RATE]) {
25775 -                       seqcount_t *running;
25776 +                       net_seqlock_t *running;
25777  
25778                         err = -EOPNOTSUPP;
25779                         if (sch->flags & TCQ_F_MQROOT)
25780 diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
25781 index 657c13362b19..cbab8d4d5864 100644
25782 --- a/net/sched/sch_generic.c
25783 +++ b/net/sched/sch_generic.c
25784 @@ -426,7 +426,11 @@ struct Qdisc noop_qdisc = {
25785         .list           =       LIST_HEAD_INIT(noop_qdisc.list),
25786         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
25787         .dev_queue      =       &noop_netdev_queue,
25788 +#ifdef CONFIG_PREEMPT_RT_BASE
25789 +       .running        =       __SEQLOCK_UNLOCKED(noop_qdisc.running),
25790 +#else
25791         .running        =       SEQCNT_ZERO(noop_qdisc.running),
25792 +#endif
25793         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
25794  };
25795  EXPORT_SYMBOL(noop_qdisc);
25796 @@ -620,9 +624,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
25797         lockdep_set_class(&sch->busylock,
25798                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
25799  
25800 +#ifdef CONFIG_PREEMPT_RT_BASE
25801 +       seqlock_init(&sch->running);
25802 +       lockdep_set_class(&sch->running.seqcount,
25803 +                         dev->qdisc_running_key ?: &qdisc_running_key);
25804 +       lockdep_set_class(&sch->running.lock,
25805 +                         dev->qdisc_running_key ?: &qdisc_running_key);
25806 +#else
25807         seqcount_init(&sch->running);
25808         lockdep_set_class(&sch->running,
25809                           dev->qdisc_running_key ?: &qdisc_running_key);
25810 +#endif
25811  
25812         sch->ops = ops;
25813         sch->enqueue = ops->enqueue;
25814 @@ -917,7 +929,7 @@ void dev_deactivate_many(struct list_head *head)
25815         /* Wait for outstanding qdisc_run calls. */
25816         list_for_each_entry(dev, head, close_list)
25817                 while (some_qdisc_is_busy(dev))
25818 -                       yield();
25819 +                       msleep(1);
25820  }
25821  
25822  void dev_deactivate(struct net_device *dev)
25823 diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
25824 index c3f652395a80..2dd84493528e 100644
25825 --- a/net/sunrpc/svc_xprt.c
25826 +++ b/net/sunrpc/svc_xprt.c
25827 @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
25828                 goto out;
25829         }
25830  
25831 -       cpu = get_cpu();
25832 +       cpu = get_cpu_light();
25833         pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
25834  
25835         atomic_long_inc(&pool->sp_stats.packets);
25836 @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
25837  
25838                 atomic_long_inc(&pool->sp_stats.threads_woken);
25839                 wake_up_process(rqstp->rq_task);
25840 -               put_cpu();
25841 +               put_cpu_light();
25842                 goto out;
25843         }
25844         rcu_read_unlock();
25845 @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
25846                 goto redo_search;
25847         }
25848         rqstp = NULL;
25849 -       put_cpu();
25850 +       put_cpu_light();
25851  out:
25852         trace_svc_xprt_do_enqueue(xprt, rqstp);
25853  }
25854 diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
25855 index 6fdc97ef6023..523e0420d7f0 100755
25856 --- a/scripts/mkcompile_h
25857 +++ b/scripts/mkcompile_h
25858 @@ -4,7 +4,8 @@ TARGET=$1
25859  ARCH=$2
25860  SMP=$3
25861  PREEMPT=$4
25862 -CC=$5
25863 +RT=$5
25864 +CC=$6
25865  
25866  vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; }
25867  
25868 @@ -57,6 +58,7 @@ UTS_VERSION="#$VERSION"
25869  CONFIG_FLAGS=""
25870  if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
25871  if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
25872 +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi
25873  UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
25874  
25875  # Truncate to maximum length
25876 diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
25877 index c61fd50f771f..1583de410f62 100644
25878 --- a/sound/core/pcm_native.c
25879 +++ b/sound/core/pcm_native.c
25880 @@ -135,7 +135,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock);
25881  void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream)
25882  {
25883         if (!substream->pcm->nonatomic)
25884 -               local_irq_disable();
25885 +               local_irq_disable_nort();
25886         snd_pcm_stream_lock(substream);
25887  }
25888  EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq);
25889 @@ -150,7 +150,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream)
25890  {
25891         snd_pcm_stream_unlock(substream);
25892         if (!substream->pcm->nonatomic)
25893 -               local_irq_enable();
25894 +               local_irq_enable_nort();
25895  }
25896  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq);
25897  
25898 @@ -158,7 +158,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream)
25899  {
25900         unsigned long flags = 0;
25901         if (!substream->pcm->nonatomic)
25902 -               local_irq_save(flags);
25903 +               local_irq_save_nort(flags);
25904         snd_pcm_stream_lock(substream);
25905         return flags;
25906  }
25907 @@ -176,7 +176,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream,
25908  {
25909         snd_pcm_stream_unlock(substream);
25910         if (!substream->pcm->nonatomic)
25911 -               local_irq_restore(flags);
25912 +               local_irq_restore_nort(flags);
25913  }
25914  EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore);
25915  